# Rainfall Prediction Project


## CSV getter

In [186]:
import csv

# Open the input text file and read its content
with open('usa.txt', 'r') as infile:
    data = infile.readlines()

# Process the data: Split each line by commas and store in a list
processed_data = [line.strip().split(',') for line in data]

# Write the processed data to a CSV file
with open('output.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(processed_data)

print("Conversion complete. 'output.csv' is created.")

Conversion complete. 'output.csv' is created.


## Data loading


In [187]:
import pandas as pd
df = pd.read_csv('/content/output.csv')
df.head()

Unnamed: 0,Date,Location,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow
0,2024-01-01,New York,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,0
1,2024-01-02,New York,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0
2,2024-01-03,New York,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,1
3,2024-01-04,New York,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0
4,2024-01-05,New York,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,0


Dividing year by month into 4 categories:

1. Winter
2. Summer
3. Monsoon
4. Spring


This method requires some more refinement to account for correct months being assigned the correct season.

In [188]:
df['Date'] = df['Date'].apply(lambda x : str(x))
df['Date'] = df['Date'].apply(lambda x : x[5:7])
df['Date'] = df['Date'].apply(lambda x : int(x))
df['Date'] = 1 + (df['Date'] // 4)
df

Unnamed: 0,Date,Location,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow
0,1,New York,87.524795,75.655455,28.379506,0.000000,69.617966,1026.030278,0
1,1,New York,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0
2,1,New York,80.943050,64.740043,14.184831,0.916884,77.364763,980.796739,1
3,1,New York,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0
4,1,New York,37.059963,34.766784,3.689661,1.361272,85.584000,1031.790859,0
...,...,...,...,...,...,...,...,...,...
73095,4,Washington D.C.,40.614393,65.099438,28.778327,0.000000,54.168514,977.083747,0
73096,4,Washington D.C.,52.641643,30.610525,12.282890,0.871000,22.068055,980.591675,0
73097,4,Washington D.C.,56.492591,96.740232,2.894762,1.191956,52.336048,1016.469174,1
73098,4,Washington D.C.,65.748956,63.900004,24.632400,0.483421,76.785280,1032.396146,1


Splitting US cities into Zones:

2: Cold (high chance of rain)


1: Moderate (some chance of rain)

0: Hot (low chance of rain)


In [189]:
df['Location'].value_counts()
location_map = {
    'Chicago': 2, 'Denver': 2, 'Indianapolis': 2,
    'Philadelphia': 2, 'Washington D.C.': 2, 'New York': 2, 'Seattle': 2,
    'Austin': 0, 'Dallas': 0, 'Fort Worth': 0,
    'Houston': 0, 'Jacksonville': 0, 'Phoenix': 0, 'San Antonio': 0,
    'Charlotte': 1, 'Columbus': 1, 'Los Angeles': 1,
    'San Diego': 1, 'San Francisco': 1, 'San Jose': 1
}
df['Location'] = df['Location'].map(location_map)

Calculating a 'rain score' for each city and time of year

In [204]:
df['Rain Score'] = (df['Precipitation'] * df['Humidity']) // 2
# df = df.drop(columns=['Date'])
# df = df.drop(columns=['Location'])
df
df['Rain Score']

Unnamed: 0,Rain Score
0,0.0
1,7.0
2,29.0
3,2.0
4,23.0
...,...
73095,0.0
73096,13.0
73097,57.0
73098,15.0


In [191]:
rain_tmrw = df['Rain Tomorrow']
rest = df.drop(columns=['Rain Tomorrow'])
rest

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Score
0,87.524795,75.655455,28.379506,0.000000,69.617966,1026.030278,3.0
1,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,10.0
2,80.943050,64.740043,14.184831,0.916884,77.364763,980.796739,32.0
3,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,5.0
4,37.059963,34.766784,3.689661,1.361272,85.584000,1031.790859,26.0
...,...,...,...,...,...,...,...
73095,40.614393,65.099438,28.778327,0.000000,54.168514,977.083747,6.0
73096,52.641643,30.610525,12.282890,0.871000,22.068055,980.591675,19.0
73097,56.492591,96.740232,2.894762,1.191956,52.336048,1016.469174,63.0
73098,65.748956,63.900004,24.632400,0.483421,76.785280,1032.396146,21.0


# Logistic Regression

In [205]:
from sklearn.model_selection import train_test_split
rest_train, rest_test, rain_tmrw_train, rain_tmrw_test = train_test_split(rest, rain_tmrw, test_size=0.2, random_state=100)

In [223]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, log_loss, recall_score, confusion_matrix, precision_score, classification_report
model = LogisticRegression(class_weight={0:1, 1:2}, solver='liblinear')
model.fit(rest_train, rain_tmrw_train)
pred = model.predict(rest_test)

F1-Score

In [224]:
f1_score(pred, rain_tmrw_test)

0.8904088767289862

Accuracy Score

In [225]:
accuracy_score(pred, rain_tmrw_test)

0.9506839945280438

Log Loss

In [230]:
log_loss(rain_tmrw_test, model.predict_proba(rest_test)[:, 1])

0.12854780220748485

Confusion Matrix

In [229]:
confusion_matrix(rain_tmrw_test, pred)

array([[10970,   416],
       [  305,  2929]])

Precision Score

In [228]:
precision_score(pred, rain_tmrw_test)

0.9056895485466914

Classification Report

In [227]:
print(classification_report(pred, rain_tmrw_test))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97     11275
           1       0.91      0.88      0.89      3345

    accuracy                           0.95     14620
   macro avg       0.93      0.92      0.93     14620
weighted avg       0.95      0.95      0.95     14620



Feature Importance

In [216]:
import numpy as np
coefficients = model.coef_[0]
feature_importance = np.abs(coefficients)
for feature, importance in zip(rest.columns, feature_importance):
    print(f"{feature}: {importance}")

Temperature: 0.015775952514613008
Humidity: 0.5349391065273075
Wind Speed: 0.010017562548813325
Precipitation: 6.359692346347937
Cloud Cover: 0.004804523113047583
Pressure: 0.04446823172390788
Rain Score: 9.768136582540262
