In [65]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import re
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score

In [66]:
data=pd.read_csv('dataset.csv')

In [67]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,18520826,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,18530903,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,18540907,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,18540908,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,18540908,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [68]:
len(data)

38489

In [69]:
data['Date']

0        18520826
1        18530903
2        18540907
3        18540908
4        18540908
           ...   
38484    20201120
38485    20201120
38486    20201120
38487    20201120
38488    20201121
Name: Date, Length: 38489, dtype: int64

In [70]:
#Converting Date to readable format
data['Date']=pd.to_datetime(data['Date'] , format='%Y%m%d')

In [71]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,1852-08-26,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,1853-09-03,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,1854-09-07,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,1854-09-08,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,1854-09-08,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [72]:
#Extracting Year and month from Date

In [73]:
data['Year']=data['Date'].apply(lambda date:date.year)
data['Month']=data['Date'].apply(lambda date:date.month)
data.drop(['Date'],axis=1,inplace=True)

In [74]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1852,8
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1853,9
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9


In [75]:
#Adding hemisphere column ( N,E- > 0 S,W-> 1)

In [76]:
def Convhemisphere(coord):
        hemisphere = re.findall(r'[NSWE]' , coord)[0]
        if hemisphere == 'N' or hemisphere == 'E':
            return 0
        else:
            return 1

In [77]:
data['Latitude_Hemisphere'] = data['Latitude'].apply(Convhemisphere)
data['Longitude_Hemisphere'] = data['Longitude'].apply(Convhemisphere)
data['Latitude_Hemisphere'] = data['Latitude_Hemisphere'].astype('category')
data['Longitude_Hemisphere'] = data['Longitude_Hemisphere'].astype('category')

In [78]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1852,8,0,1
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1853,9,0,1
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1


In [81]:
#Converting Latitude to numerical type

In [82]:
def ConvNum(coord):
        num = coord.translate({ ord(c): None for c in "NSEW" })
        return num

In [83]:
ConvNum(data['Latitude'][10])

' 42.5'

In [84]:
data['Latitude'][10].translate({ ord(c): None for c in "NS" })

' 42.5'

In [85]:
data['Latitude_num'] =  data['Latitude'].apply(ConvNum)
data['Longitude_num'] =   data['Longitude'].apply(ConvNum)

In [86]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [87]:
#Dropping latitude and longitude

In [88]:
data.drop(['Latitude','Longitude'],axis=1,inplace=True)

In [89]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [90]:
#Dropping columns with null

In [91]:
drop=['Column9','Column10','Column11','Column12','Column13','Column14','Column15','Column16','Column17','Column18','Column19','Column20']

In [92]:
data.drop(drop,axis=1,inplace=True)

In [93]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1


In [94]:
len(data)

38489

In [95]:
data_dup=data.copy()

In [96]:
data

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [97]:
Y=data['Status']

In [98]:
X=data
X.drop(['Status'],axis=1,inplace=True)

In [99]:
X.head()


Unnamed: 0,UTC,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,100,961,1852,8,0,1,30.2,88.6
1,1200,130,924,1853,9,0,1,19.7,56.2
2,1200,110,938,1854,9,0,1,28.0,78.6
3,1800,100,950,1854,9,0,1,31.6,81.1
4,2000,100,950,1854,9,0,1,31.7,81.1


In [100]:
Y.head()

0     HU
1     HU
2     HU
3     HU
4     HU
Name: Status, dtype: object

In [101]:
Y.shape

(38489,)

In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 

In [103]:
#model=RandomForestClassifier(n_estimators=500)
#model.fit(X,Y)

In [104]:
#Feature selection

In [105]:
#features = pd.Series(model.feature_importances_ , index= X.columns).sort_values(ascending=False)

In [106]:
#print(features)

In [107]:
#Now we choose the top 5 features 

In [108]:
data_dup

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [109]:
data_selected=data_dup[['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num','Status']]

In [110]:
data_selected.head()

Unnamed: 0,MSWS,MinCentralPressure,Latitude_num,Year,Longitude_num,Status
0,100,961,30.2,1852,88.6,HU
1,130,924,19.7,1853,56.2,HU
2,110,938,28.0,1854,78.6,HU
3,100,950,31.6,1854,81.1,HU
4,100,950,31.7,1854,81.1,HU


In [111]:
Y=data_selected['Status']
X=data_selected
X.drop(['Status'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [112]:
#Split test and train

In [113]:
X_train , X_test , Y_train, Y_test  = train_test_split(X,Y, test_size=0.2, random_state=32)

In [114]:
#Feature scaling
from sklearn.preprocessing import StandardScaler

In [115]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [188]:
from xgboost import XGBClassifier
classifier = XGBClassifier(
                      learning_rate=0.099,
    objective="multi:softmax",
    num_class=11,
                      colsample_bytree = 0.8,
                      subsample = 0.8,
                      n_estimators=360, 
                      max_depth=14,
                      gamma=0)


In [189]:
classifier.fit(X_train, Y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
       gamma=0, gpu_id=-1, importance_type=None,
       interaction_constraints='', learning_rate=0.099, max_delta_step=0,
       max_depth=14, min_child_weight=1, missing=nan,
       monotone_constraints='()', n_estimators=360, n_jobs=8, num_class=11,
       num_parallel_tree=1, objective='multi:softprob', predictor='auto',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
       subsample=0.8, tree_method='exact', use_label_encoder=True,
       validate_parameters=1, verbosity=None)

In [190]:
prediction=classifier.predict(X_test)
print(np.array(Y_test))

[' TS' ' TD' ' TD' ... ' TS' ' TD' ' EX']


  if diff:


In [191]:
cm = confusion_matrix(Y_test, prediction)
print(cm)
accuracy_score(Y_test, prediction)

[[  65    0    0   10    0    0    0    5    7    0    0]
 [   0  390   11    8    1    1    0   17   53    0    0]
 [   0    5 2021    0    0    0    0    0    0    1    0]
 [   6   13    0  726    0    0    0   62   17    0    0]
 [   0    2    0    1   35    0    0   19    0    0    0]
 [   0    5    0    0    0   65    0    0   43    0    0]
 [   0    0    0    0    0    0    0    0    0    2    0]
 [   3   13    0   80    6    0    0 1413    0    0    0]
 [   0   19    1    4    0    6    0    0 2539    0    0]
 [   0    0    0    0    0    0    0    0    0    9    0]
 [   0    0    0    1    0    0    0    3    2    0    8]]


0.9445310470252013

In [179]:
np.unique(Y_test)

array([' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV'], dtype=object)

In [None]:
fn=['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num']
cn=[' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV','wut']

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus



In [None]:
tree.export_graphviz(viz,
                     out_file="tree.dot",
                     feature_names = fn, 
                     class_names=cn,
                     filled = True)