In [110]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score

In [41]:
data=pd.read_csv('dataset.csv')

In [42]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,18520826,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,18530903,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,18540907,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,18540908,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,18540908,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [43]:
len(data)

38489

In [44]:
data['Date']

0        18520826
1        18530903
2        18540907
3        18540908
4        18540908
           ...   
38484    20201120
38485    20201120
38486    20201120
38487    20201120
38488    20201121
Name: Date, Length: 38489, dtype: int64

In [45]:
#Converting Date to readable format
data['Date']=pd.to_datetime(data['Date'] , format='%Y%m%d')

In [46]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,1852-08-26,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,1853-09-03,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,1854-09-07,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,1854-09-08,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,1854-09-08,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [47]:
#Extracting Year and month from Date

In [48]:
data['Year']=data['Date'].apply(lambda date:date.year)
data['Month']=data['Date'].apply(lambda date:date.month)
data.drop(['Date'],axis=1,inplace=True)

In [49]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1852,8
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1853,9
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9


In [50]:
#Adding hemisphere column ( N,E- > 0 S,W-> 1)

In [51]:
def Convhemisphere(coord):
        hemisphere = re.findall(r'[NSWE]' , coord)[0]
        if hemisphere == 'N' or hemisphere == 'E':
            return 0
        else:
            return 1

In [52]:
data['Latitude_Hemisphere'] = data['Latitude'].apply(Convhemisphere)
data['Longitude_Hemisphere'] = data['Longitude'].apply(Convhemisphere)
data['Latitude_Hemisphere'] = data['Latitude_Hemisphere'].astype('category')
data['Longitude_Hemisphere'] = data['Longitude_Hemisphere'].astype('category')

In [53]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1852,8,0,1
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1853,9,0,1
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1


In [54]:
#Converting Latitude to numerical type

In [55]:
def ConvNum(coord):
        num = coord.translate({ ord(c): None for c in "NSEW" })
        return num

In [56]:
ConvNum(data['Latitude'][10])

' 42.5'

In [57]:
data['Latitude'][10].translate({ ord(c): None for c in "NS" })

' 42.5'

In [58]:
data['Latitude_num'] =  data['Latitude'].apply(ConvNum)
data['Longitude_num'] =   data['Longitude'].apply(ConvNum)

In [59]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [60]:
#Dropping latitude and longitude

In [61]:
data.drop(['Latitude','Longitude'],axis=1,inplace=True)

In [62]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [63]:
#Dropping columns with null

In [64]:
drop=['Column9','Column10','Column11','Column12','Column13','Column14','Column15','Column16','Column17','Column18','Column19','Column20']

In [65]:
data.drop(drop,axis=1,inplace=True)

In [66]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1


In [67]:
len(data)

38489

In [68]:
data_dup=data.copy()

In [69]:
data

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [70]:
Y=data['Status']

In [71]:
X=data
X.drop(['Status'],axis=1,inplace=True)

In [72]:
X.head()


Unnamed: 0,UTC,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,100,961,1852,8,0,1,30.2,88.6
1,1200,130,924,1853,9,0,1,19.7,56.2
2,1200,110,938,1854,9,0,1,28.0,78.6
3,1800,100,950,1854,9,0,1,31.6,81.1
4,2000,100,950,1854,9,0,1,31.7,81.1


In [73]:
Y.head()

0     HU
1     HU
2     HU
3     HU
4     HU
Name: Status, dtype: object

In [74]:
Y.shape

(38489,)

In [83]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 

  from numpy.core.umath_tests import inner1d


In [84]:
model=RandomForestClassifier(n_estimators=500)
model.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [93]:
#Feature selection

In [94]:
features = pd.Series(model.feature_importances_ , index= X.columns).sort_values(ascending=False)

In [95]:
print(features)

MSWS                    0.495403
MinCentralPressure      0.198422
Latitude_num            0.107055
Year                    0.083805
Longitude_num           0.073028
Month                   0.028871
UTC                     0.009796
Longitude_Hemisphere    0.003621
Latitude_Hemisphere     0.000000
dtype: float64


In [96]:
#Now we choose the top 5 features 

In [97]:
data_dup

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [98]:
data_selected=data_dup[['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num','Status']]

In [99]:
data_selected.head()

Unnamed: 0,MSWS,MinCentralPressure,Latitude_num,Year,Longitude_num,Status
0,100,961,30.2,1852,88.6,HU
1,130,924,19.7,1853,56.2,HU
2,110,938,28.0,1854,78.6,HU
3,100,950,31.6,1854,81.1,HU
4,100,950,31.7,1854,81.1,HU


In [100]:
Y=data_selected['Status']
X=data_selected
X.drop(['Status'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
#Split test and train

In [101]:
X_train , X_test , Y_train, Y_test  = train_test_split(X,Y, test_size=0.1, random_state=32)

In [105]:
#Feature scaling
from sklearn.preprocessing import StandardScaler

In [106]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [107]:
model=RandomForestClassifier(n_estimators=1000)
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [108]:
prediction=model.predict(X_test)
print(np.array(Y_test))

[' TS' ' TD' ' TD' ... ' EX' ' TD' ' HU']


In [111]:
cm = confusion_matrix(Y_test, prediction)
print(cm)
accuracy_score(Y_test, prediction)

[[  32    0    0    3    0    0    0    2    1    0    0]
 [   0  191    8    4    1    1    0    9   26    0    0]
 [   0    1 1001    0    0    0    0    0    0    0    0]
 [   1    6    0  357    0    0    0   36    7    0    0]
 [   0    0    0    1   11    0    0   11    0    0    0]
 [   0    1    0    0    0   29    0    0   27    0    0]
 [   0    0    0    0    0    0    1    0    0    0    0]
 [   1    4    0   34    1    0    0  709    0    0    0]
 [   0   13    0    3    0    1    0    0 1304    0    0]
 [   0    0    0    0    0    0    0    0    0    4    0]
 [   0    0    0    0    0    0    0    0    1    0    6]]


0.9469992205767732

In [68]:
X_test

array([[ 0.1745114 ,  0.02412219, -1.11577736,  0.19253097,  0.53401927],
       [-0.76476573,  0.66532379, -1.05356645,  0.41163926,  0.61274919],
       [-0.76476573,  0.66532379, -1.11577736,  1.12374119,  1.40848376],
       ...,
       [ 0.1745114 , -0.08274475,  1.23786866, -1.28644997, -1.53826475],
       [-0.76476573,  0.66532379, -0.64919555,  0.5211934 ,  0.62399632],
       [ 1.48949938, -1.20484756,  1.66297653, -0.35523975, -0.57101142]])

In [112]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [113]:
svmprediction=classifier.predict(X_test)

In [114]:

cm = confusion_matrix(Y_test, svmprediction)
print(cm)
accuracy_score(Y_test,svmprediction)

[[   0    0    0   28    0    0    0    7    3    0    0]
 [   0  154   14   14    0    0    0   18   40    0    0]
 [   0    2 1000    0    0    0    0    0    0    0    0]
 [   0    7    0  304    0    0    0   80   16    0    0]
 [   0    1    0    2    1    0    0   19    0    0    0]
 [   0    7    0    0    0    0    0    0   50    0    0]
 [   0    0    1    0    0    0    0    0    0    0    0]
 [   0   14    0   87    1    0    0  647    0    0    0]
 [   0   25    0    0    0    0    0    0 1296    0    0]
 [   0    0    2    0    0    0    0    0    0    2    0]
 [   0    0    0    2    0    0    0    3    2    0    0]]


0.8843855546895297

In [73]:
np.unique(Y_test)

array([' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV'], dtype=object)