# XGBoost on Classification Problems

In [90]:
import pandas as pd

rain = pd.read_csv('weatherAUS.csv')

rain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [91]:
rain['RainToday']=rain['RainToday'].map({'yes': 1, 'no': 0})

In [92]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [93]:
cols_to_drop = ["Date", "Location", "RainTomorrow", "Rainfall"]

rain.drop(cols_to_drop, axis=1, inplace=True)

In [94]:

missing_props = rain.isna().mean(axis=0)

In [95]:
missing_props

MinTemp          0.010209
MaxTemp          0.008669
Evaporation      0.431665
Sunshine         0.480098
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Cloud3pm         0.408071
Temp9am          0.012148
Temp3pm          0.024811
RainToday        1.000000
dtype: float64

In [96]:
over_threshold = missing_props[missing_props >= 0.4]

In [97]:
over_threshold

Evaporation    0.431665
Sunshine       0.480098
Cloud3pm       0.408071
RainToday      1.000000
dtype: float64

In [98]:
y=rain['RainToday']
X=rain.drop('RainToday',axis=1)


In [99]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

In [100]:
from sklearn.preprocessing import StandardScaler

numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

In [101]:
cat_cols = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns

In [102]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

In [103]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

print(type(xgb_cl))


<class 'xgboost.sklearn.XGBClassifier'>


In [106]:

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [112]:
y_train.to_string()

'22926    NaN\n80735    NaN\n121764   NaN\n139821   NaN\n1867     NaN\n59331    NaN\n79024    NaN\n36149    NaN\n128233   NaN\n121286   NaN\n96978    NaN\n45840    NaN\n1646     NaN\n127574   NaN\n1675     NaN\n115369   NaN\n25185    NaN\n67469    NaN\n4470     NaN\n94177    NaN\n123701   NaN\n23280    NaN\n124226   NaN\n11863    NaN\n12419    NaN\n49615    NaN\n21971    NaN\n65864    NaN\n3710     NaN\n133435   NaN\n66219    NaN\n85270    NaN\n38456    NaN\n55174    NaN\n40280    NaN\n138255   NaN\n34016    NaN\n51129    NaN\n67379    NaN\n102015   NaN\n138705   NaN\n10725    NaN\n122691   NaN\n28486    NaN\n110515   NaN\n122547   NaN\n94361    NaN\n58057    NaN\n52445    NaN\n117050   NaN\n134107   NaN\n134631   NaN\n37089    NaN\n23648    NaN\n131945   NaN\n95424    NaN\n117978   NaN\n41551    NaN\n113439   NaN\n47932    NaN\n120163   NaN\n11674    NaN\n94017    NaN\n8597     NaN\n218      NaN\n121620   NaN\n60291    NaN\n25659    NaN\n69136    NaN\n69944    NaN\n11667    NaN\n82785

In [None]:
from sklearn.metrics import accuracy_score

# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)