# Rainfall Prediction Project


## CSV getter

In [None]:
import csv

# Pull every line of the raw text export into memory.
with open('usa.txt', 'r') as infile:
    data = list(infile)

# Trim surrounding whitespace from each line, then break it into fields on commas.
processed_data = [trimmed.split(',') for trimmed in map(str.strip, data)]

# Emit the parsed rows as CSV; newline='' leaves line endings to the csv module.
with open('output.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(processed_data)

print("Conversion complete. 'output.csv' is created.")

Conversion complete. 'output.csv' is created.


## Data loading


In [None]:
import pandas as pd
# Load the CSV written by the conversion cell. That cell writes 'output.csv'
# relative to the working directory, so read it back the same way instead of
# assuming an absolute '/content' directory exists.
df = pd.read_csv('output.csv')
df.head()

Unnamed: 0,Date,Location,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow
0,2024-01-01,New York,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,0
1,2024-01-02,New York,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0
2,2024-01-03,New York,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,1
3,2024-01-04,New York,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0
4,2024-01-05,New York,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,0


Dividing year by month into 4 categories:

1. Winter
2. Summer
3. Monsoon
4. Spring


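This is a coarse approximation: the category is computed arithmetically from the month number, so the category boundaries do not line up exactly with real seasons. The month-to-season assignment still needs refinement.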

In [None]:
# Replace each date with a season category in 1..4.
# Characters 5:7 of the 'YYYY-MM-DD' date string hold the two-digit month.
month = df['Date'].astype(str).str[5:7].astype(int)

# (month - 1) // 3 + 1 yields four equal three-month buckets:
#   Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4.
# The earlier formula 1 + month // 4 produced uneven groups of 3, 4, 4 and 1
# months (December ended up alone in category 4).
# NOTE(review): which named season each bucket number stands for is not fixed
# by this code - confirm against the intended season definitions.
df['Date'] = (month - 1) // 3 + 1
df

Unnamed: 0,Date,Location,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow
0,1,New York,87.524795,75.655455,28.379506,0.000000,69.617966,1026.030278,0
1,1,New York,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0
2,1,New York,80.943050,64.740043,14.184831,0.916884,77.364763,980.796739,1
3,1,New York,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0
4,1,New York,37.059963,34.766784,3.689661,1.361272,85.584000,1031.790859,0
...,...,...,...,...,...,...,...,...,...
73095,4,Washington D.C.,40.614393,65.099438,28.778327,0.000000,54.168514,977.083747,0
73096,4,Washington D.C.,52.641643,30.610525,12.282890,0.871000,22.068055,980.591675,0
73097,4,Washington D.C.,56.492591,96.740232,2.894762,1.191956,52.336048,1016.469174,1
73098,4,Washington D.C.,65.748956,63.900004,24.632400,0.483421,76.785280,1032.396146,1


Splitting US cities into Zones:

2: Cold (high chance of rain)


1: Moderate (some chance of rain)

0: Hot (low chance of rain)


In [None]:
# Climate-zone code per city: 2 = cold, 1 = moderate, 0 = hot.
location_map = {
    'Chicago': 2, 'Denver': 2, 'Indianapolis': 2,
    'Philadelphia': 2, 'Washington D.C.': 2, 'New York': 2, 'Seattle': 2,
    'Austin': 0, 'Dallas': 0, 'Fort Worth': 0,
    'Houston': 0, 'Jacksonville': 0, 'Phoenix': 0, 'San Antonio': 0,
    'Charlotte': 1, 'Columbus': 1, 'Los Angeles': 1,
    'San Diego': 1, 'San Francisco': 1, 'San Jose': 1
}
zone = df['Location'].map(location_map)

# Series.map produces NaN for any value that is not a key of location_map.
# Fail loudly here instead of letting NaN flow silently into the rain score.
unmapped = df.loc[zone.isna(), 'Location'].unique()
if len(unmapped):
    raise ValueError(
        f"No climate zone defined for locations: {sorted(map(str, unmapped))}"
    )

df['Location'] = zone

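Calculating a 'rain score' for each row by adding the season category (stored in `Date`) to the city's zone code (stored in `Location`). The two source columns are dropped afterwards.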

In [None]:
# Rain score = season category (held in 'Date') + climate-zone code (held in 'Location').
df['Rain Score'] = df['Date'].add(df['Location'])

# Both source columns are folded into the score, so remove them in one pass.
df = df.drop(columns=['Date', 'Location'])

df['Rain Score']

Unnamed: 0,Rain Score
0,3
1,3
2,3
3,3
4,3
...,...
73095,6
73096,6
73097,6
73098,6


In [None]:
# Target column for the classifiers below.
rain_tmrw = df['Rain Tomorrow']

# Feature matrix. `rest` is not assigned in any visible cell, so this cell
# raised NameError on a fresh kernel. The column list is reconstructed from
# the columns shown in this cell's recorded output.
# NOTE(review): confirm this is the intended feature set ('Precipitation' and
# 'Rain Score' are not part of it).
feature_columns = ['Temperature', 'Humidity', 'Wind Speed', 'Cloud Cover', 'Pressure']
rest = df[feature_columns]
rest.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Cloud Cover,Pressure
0,87.524795,75.655455,28.379506,69.617966,1026.030278
1,83.259325,28.712617,12.436433,41.606048,995.962065
2,80.94305,64.740043,14.184831,77.364763,980.796739
3,78.097552,59.738984,19.444029,52.541196,979.012163
4,37.059963,34.766784,3.689661,85.584,1031.790859


# Decision Tree

In [177]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
rest_train, rest_test, rain_tmrw_train, rain_tmrw_test = train_test_split(
    rest,
    rain_tmrw,
    test_size=0.2,
    random_state=100,
)

# Seeded decision tree with balanced class weights, fitted on the training split.
model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
model.fit(rest_train, rain_tmrw_train)

In [178]:
from sklearn.metrics import accuracy_score, f1_score

# Predict on the held-out split and score it.
pred = model.predict(rest_test)
# scikit-learn metrics expect (y_true, y_pred); pass the ground truth first.
# Accuracy itself is unaffected by the order, but this matches the convention
# and the f1_score call in the logistic-regression section.
accuracy_score(rain_tmrw_test, pred)

0.7577291381668947

In [179]:
# scikit-learn metrics expect (y_true, y_pred); pass the ground truth first,
# matching the f1_score call in the logistic-regression section.
f1_score(rain_tmrw_test, pred)

0.43902439024390244

# Logistic Regression

In [182]:
from sklearn.model_selection import train_test_split
# Re-create the train/test split with the same arguments and seed as the
# decision-tree section (20% held out, random_state=100).
rest_train, rest_test, rain_tmrw_train, rain_tmrw_test = train_test_split(
    rest,
    rain_tmrw,
    test_size=0.2,
    random_state=100,
)

In [183]:
from sklearn.linear_model import LogisticRegression
# Fit a class-balanced logistic regression on the training split.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(class_weight="balanced").fit(rest_train, rain_tmrw_train)

# Score the held-out predictions with F1.
pred = model.predict(rest_test)
f1_score(y_true=rain_tmrw_test, y_pred=pred)

0.5670412818515633

In [184]:
from sklearn.metrics import accuracy_score, f1_score
# scikit-learn metrics expect (y_true, y_pred); pass the ground truth first,
# matching the f1_score call in the previous cell.
accuracy_score(rain_tmrw_test, pred)

0.7338577291381669

In [185]:
from sklearn.metrics import log_loss
# Take the second column of the predicted probabilities - presumably the
# probability of class 1 (rain tomorrow); verify against model.classes_.
rain_probability = model.predict_proba(rest_test)[:, 1]
loss = log_loss(rain_tmrw_test, rain_probability)
print("Log Loss:", loss)


Log Loss: 0.5292662695264799
