In [53]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import r2_score 
import numpy as np

In [3]:
# Q1: Read the weather CSV into a pandas DataFrame and preview its first rows.
# NOTE(review): the path is an absolute location on one machine; the file must
# exist there for this cell to run.
csv_path = r"C:\Users\91700\OneDrive\Desktop\weather.csv"
data = pd.read_csv(csv_path)
print(data.head())

   MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
0      8.0     24.3       0.0          3.4       6.3          NW   
1     14.0     26.9       3.6          4.4       9.7         ENE   
2     13.7     23.4       3.6          5.8       3.3          NW   
3     13.3     15.5      39.8          7.2       9.1          NW   
4      7.6     16.1       2.8          5.6      10.6         SSE   

   WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0           30.0         SW         NW           6.0  ...           29   
1           39.0          E          W           4.0  ...           36   
2           85.0          N        NNE           6.0  ...           69   
3           54.0        WNW          W          30.0  ...           56   
4           50.0        SSE        ESE          20.0  ...           49   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1019.7       1015.0         7         7     14.4     23.6 

In [4]:
# Q2: Count the missing entries in each column.
# isna() flags every missing cell; summing the flags column-wise gives one
# count per column.
null_counts = data.isna().sum()
print("Number of null values:\n", null_counts)

Number of null values:
 MinTemp           0
MaxTemp           0
Rainfall          0
Evaporation       0
Sunshine          3
WindGustDir       3
WindGustSpeed     2
WindDir9am       31
WindDir3pm        1
WindSpeed9am      7
WindSpeed3pm      0
Humidity9am       0
Humidity3pm       0
Pressure9am       0
Pressure3pm       0
Cloud9am          0
Cloud3pm          0
Temp9am           0
Temp3pm           0
RainToday         0
RISK_MM           0
RainTomorrow      0
dtype: int64


In [7]:
# Q3: Discard every row that has at least one missing value.
# `data` is rebound to the filtered frame, so later cells only see complete rows.
data = data.dropna(axis=0, how="any")
print(data)

     MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
0        8.0     24.3       0.0          3.4       6.3          NW   
1       14.0     26.9       3.6          4.4       9.7         ENE   
2       13.7     23.4       3.6          5.8       3.3          NW   
3       13.3     15.5      39.8          7.2       9.1          NW   
4        7.6     16.1       2.8          5.6      10.6         SSE   
..       ...      ...       ...          ...       ...         ...   
361      9.0     30.7       0.0          7.6      12.1         NNW   
362      7.1     28.4       0.0         11.6      12.7           N   
363     12.5     19.9       0.0          8.4       5.3         ESE   
364     12.5     26.9       0.0          5.0       7.1          NW   
365     12.3     30.2       0.0          6.0      12.6          NW   

     WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0             30.0         SW         NW           6.0  ...           29   
1      

In [19]:
# Q4: Label-encode the string-valued columns.
# Each listed column of `data` is replaced by the integer codes that a
# LabelEncoder assigns to its distinct values.
label_columns = ["WindGustDir", "WindDir9am", "WindDir3pm","RainToday", "RainTomorrow"]

# Rebind `data` to an explicit copy before assigning columns into it.
# NOTE(review): presumably this avoids a SettingWithCopyWarning when `data`
# is the result of an earlier row filter such as dropna() — confirm.
data = data.copy()

# Keep one fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder would overwrite its fitted classes on every iteration, leaving
# only the last column's code -> label mapping available for decoding.
label_encoders = {}
for col in label_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# `label_encoder` stays bound to the encoder fitted on the last column, which
# is the state this name had when one encoder was reused for every column.
label_encoder = label_encoders[label_columns[-1]]

# Show the last encoded column, named explicitly instead of relying on the
# loop variable still being in scope after the loop.
print(data[label_columns[-1]])


0      1
1      1
2      1
3      1
4      0
      ..
361    0
362    0
363    0
364    0
365    0
Name: RainTomorrow, Length: 328, dtype: int64


In [20]:
# Q5: Build a numeric-only view of the data.
# select_dtypes picks the columns whose dtype is numeric; no values are
# converted here, the matching columns are simply collected into data_numeric.
numeric_columns = data.select_dtypes(include="number").columns
data_numeric = data.loc[:, numeric_columns]
print(data_numeric)

     MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustDir  \
0        8.0     24.3       0.0          3.4       6.3            7   
1       14.0     26.9       3.6          4.4       9.7            1   
2       13.7     23.4       3.6          5.8       3.3            7   
3       13.3     15.5      39.8          7.2       9.1            7   
4        7.6     16.1       2.8          5.6      10.6           10   
..       ...      ...       ...          ...       ...          ...   
361      9.0     30.7       0.0          7.6      12.1            6   
362      7.1     28.4       0.0         11.6      12.7            3   
363     12.5     19.9       0.0          8.4       5.3            2   
364     12.5     26.9       0.0          5.0       7.1            7   
365     12.3     30.2       0.0          6.0      12.6            7   

     WindGustSpeed  WindDir9am  WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0             30.0          12           7           6.0  ...        

In [60]:
# Q6: Rescale the numeric columns with a MinMaxScaler.
# The scaler is fitted on data_numeric and then applied to that same frame;
# the result is bound to data_scaled and printed.
scaler = MinMaxScaler()
scaler.fit(data_numeric)
data_scaled = scaler.transform(data_numeric)
print(data_scaled)

[[0.50763359 0.59219858 0.         ... 0.         0.09045226 1.        ]
 [0.73664122 0.68439716 0.09045226 ... 1.         0.09045226 1.        ]
 [0.72519084 0.56028369 0.09045226 ... 1.         1.         1.        ]
 ...
 [0.67938931 0.43617021 0.         ... 0.         0.         0.        ]
 [0.67938931 0.68439716 0.         ... 0.         0.         0.        ]
 [0.67175573 0.80141844 0.         ... 0.         0.         0.        ]]


In [85]:
# Column to predict.
target = 'RainTomorrow'

# Predictor columns, grouped by measurement type for readability.
features = [
    'MinTemp', 'MaxTemp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]

# Design matrix and response, both taken from `data`.
X = data.loc[:, features]
y = data.loc[:, target]

In [86]:
#Q7: Applying Linear regression
# Create the estimator in this cell: no other cell in the notebook defines
# `linear_reg_model`, so without this line a fresh kernel fails with NameError.
linear_reg_model = LinearRegression()
linear_reg_model.fit(X, y)

# Predictions on the same rows the model was fitted on.
y_pred = linear_reg_model.predict(X)

#Q8: r2 score
# R2 of the fitted model on its own training data (no hold-out split is used here).
r2_original = r2_score(y, y_pred)
print("R2 value without cross-validation:", r2_original)

R2 value without cross-validation: 0.4398775191444345


In [90]:
# Q9: Score the linear model with 6-fold cross-validation, using R2 as the
# metric. cv_scores holds one R2 value per fold.
cv_scores = cross_val_score(
    estimator=linear_reg_model,
    X=X,
    y=y,
    scoring='r2',
    cv=6,
)
print(cv_scores)

[ 0.23361064  0.24657374  0.58458409 -0.16249807  0.49897211  0.44872029]
