# Creating a Pipeline

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Source CSV for the weather-classification data. The raw-string prefix keeps
# the Windows backslashes literal.
# NOTE(review): this is an absolute path under one user's profile, so the
# notebook only runs on a machine where this exact file exists.
DATA_CSV = r"C:\Users\asusr\Documents\ML_datasets\weather_classification\weather_classification_data.csv"
df = pd.read_csv(DATA_CSV)

In [3]:
# Preview the first rows to eyeball the column names and typical values.
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [17]:
# Print the column dtypes and non-null counts to see which features are
# numeric and which are object (categorical) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [18]:
# Count missing values per column (isna() is the same check as isnull()).
missing_per_column = df.isna().sum()
missing_per_column

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

In [19]:
from sklearn.pipeline import Pipeline

In [21]:
# Preprocessing for numerical data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [22]:
# Numeric branch: fill NaN entries with the column mean, then pass the result
# through StandardScaler.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
standard_scaler = StandardScaler()
numeric_processor = Pipeline(
    steps=[
        ("imputation", mean_imputer),
        ("scaler", standard_scaler),
    ]
)

In [23]:
# Display the numeric preprocessing pipeline to check its steps.
numeric_processor

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Categorical branch: replace missing entries with the literal "missing", then
# one-hot encode the result.
#
# handle_unknown="ignore" makes the encoder emit an all-zero block for a
# category it did not see during fit instead of raising an error. With the
# default ("error"), predict() fails on any new category -- including the
# imputer's own "missing" placeholder, which the encoder never sees at fit
# time when the training data contains no nulls (as the isnull() check in this
# notebook reports).
#
# NOTE: the step name "imutation" is misspelled but intentionally left as-is,
# because step names are part of the pipeline's parameter keys.
categorical_processor=Pipeline(
    steps=[("imutation",SimpleImputer(fill_value="missing",strategy="constant")),
           ("onehot",OneHotEncoder(handle_unknown="ignore"))]
)

In [27]:
# Display the categorical preprocessing pipeline to check its steps.
categorical_processor

Combining both preprocessing pipelines with a ColumnTransformer

In [28]:
from sklearn.compose import ColumnTransformer

In [46]:
# Route each group of columns to its matching preprocessing branch.
numeric_columns = [
    "Temperature",
    "Humidity",
    "Wind Speed",
    "Precipitation (%)",
    "Atmospheric Pressure",
    "UV Index",
    "Visibility (km)",
]
categorical_columns = ["Cloud Cover", "Season", "Location"]

processor = ColumnTransformer(
    [
        ("numeri", numeric_processor, numeric_columns),
        ("categorical", categorical_processor, categorical_columns),
    ]
)

In [47]:
# Display the combined column transformer to check both branches.
processor

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [49]:
# End-to-end model: the column preprocessing followed by a LogisticRegression
# left at its default settings.
classifier = LogisticRegression()
pipe = make_pipeline(processor, classifier)

In [50]:
# Display the full pipeline (preprocessing + classifier) before fitting.
pipe

In [38]:
from sklearn.model_selection import train_test_split

In [51]:
# Separate the label column from the feature columns.
TARGET_COLUMN = "Weather Type"
x = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

In [52]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [53]:
# Inspect the training features; the rendered output below is an abbreviated
# view of the frame, not every row.
x_train

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
10163,24.0,75,17.5,84.0,overcast,999.76,3,Summer,5.0,mountain
12929,46.0,69,3.5,43.0,cloudy,842.22,12,Spring,5.5,inland
5735,24.0,87,18.0,65.0,partly cloudy,1011.48,1,Spring,1.5,inland
440,1.0,102,35.0,90.0,overcast,988.38,5,Winter,4.0,mountain
2018,42.0,87,17.5,100.0,partly cloudy,1016.17,8,Summer,8.0,inland
...,...,...,...,...,...,...,...,...,...,...
11964,32.0,98,5.5,89.0,overcast,1006.51,2,Spring,2.5,coastal
5191,41.0,79,14.0,17.0,overcast,959.01,14,Summer,1.0,mountain
5390,34.0,52,7.0,13.0,overcast,1009.30,1,Summer,8.0,coastal
860,9.0,73,10.0,84.0,overcast,1000.06,14,Spring,8.0,mountain


In [54]:
# Inspect the training labels; their index values match the sampled rows of
# x_train.
y_train

10163     Rainy
12929    Cloudy
5735      Rainy
440       Snowy
2018     Cloudy
          ...  
11964     Rainy
5191      Rainy
5390     Cloudy
860      Cloudy
7270      Rainy
Name: Weather Type, Length: 9240, dtype: object

In [55]:
# Fit the preprocessing steps and the classifier on the training split only.
pipe.fit(X=x_train, y=y_train)

In [57]:
# Predict weather types for the held-out rows; the fitted pipeline applies the
# same preprocessing before classifying.
y_pre = pipe.predict(X=x_test)

In [58]:
# Show the predicted labels for the test split (an array of class-name strings).
y_pre

array(['Sunny', 'Sunny', 'Sunny', ..., 'Snowy', 'Sunny', 'Sunny'],
      dtype=object)

In [59]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [60]:
# Share of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pre)
print("accuracy", test_accuracy)

accuracy 0.8712121212121212


In [61]:
# Per-class precision, recall, F1 and support on the test split.
report_text = classification_report(y_true=y_test, y_pred=y_pre)
print(report_text)

              precision    recall  f1-score   support

      Cloudy       0.82      0.84      0.83       955
       Rainy       0.86      0.85      0.86       982
       Snowy       0.88      0.94      0.91      1033
       Sunny       0.93      0.86      0.89       990

    accuracy                           0.87      3960
   macro avg       0.87      0.87      0.87      3960
weighted avg       0.87      0.87      0.87      3960



In [62]:
# Rows are true classes and columns are predicted classes, in sorted label order.
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pre)
print(test_confusion)

[[801  87  34  33]
 [ 60 834  75  13]
 [ 34  13 968  18]
 [ 81  34  28 847]]
