In [1]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import re
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
data=pd.read_csv('dataset.csv')

In [3]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,18520826,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,18530903,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,18540907,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,18540908,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,18540908,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [4]:
len(data)

38489

In [5]:
data['Date']

0        18520826
1        18530903
2        18540907
3        18540908
4        18540908
           ...   
38484    20201120
38485    20201120
38486    20201120
38487    20201120
38488    20201121
Name: Date, Length: 38489, dtype: int64

In [6]:
#Converting Date to readable format
data['Date']=pd.to_datetime(data['Date'] , format='%Y%m%d')

In [7]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,1852-08-26,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,1853-09-03,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,1854-09-07,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,1854-09-08,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,1854-09-08,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [8]:
#Extracting Year and month from Date

In [9]:
data['Year']=data['Date'].apply(lambda date:date.year)
data['Month']=data['Date'].apply(lambda date:date.month)
data.drop(['Date'],axis=1,inplace=True)

In [10]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1852,8
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1853,9
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9


In [11]:
#Adding hemisphere column ( N,E- > 0 S,W-> 1)

In [12]:
def Convhemisphere(coord):
        hemisphere = re.findall(r'[NSWE]' , coord)[0]
        if hemisphere == 'N' or hemisphere == 'E':
            return 0
        else:
            return 1

In [13]:
data['Latitude_Hemisphere'] = data['Latitude'].apply(Convhemisphere)
data['Longitude_Hemisphere'] = data['Longitude'].apply(Convhemisphere)
data['Latitude_Hemisphere'] = data['Latitude_Hemisphere'].astype('category')
data['Longitude_Hemisphere'] = data['Longitude_Hemisphere'].astype('category')

In [14]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1852,8,0,1
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1853,9,0,1
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1


In [15]:
#Converting Latitude to numerical type

In [16]:
def ConvNum(coord):
        num = coord.translate({ ord(c): None for c in "NSEW" })
        return num

In [17]:
ConvNum(data['Latitude'][10])

' 42.5'

In [18]:
data['Latitude'][10].translate({ ord(c): None for c in "NS" })

' 42.5'

In [19]:
data['Latitude_num'] =  data['Latitude'].apply(ConvNum)
data['Longitude_num'] =   data['Longitude'].apply(ConvNum)

In [20]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [21]:
#Dropping latitude and longitude

In [22]:
data.drop(['Latitude','Longitude'],axis=1,inplace=True)

In [23]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [24]:
#Dropping columns with null

In [25]:
drop=['Column9','Column10','Column11','Column12','Column13','Column14','Column15','Column16','Column17','Column18','Column19','Column20']

In [26]:
data.drop(drop,axis=1,inplace=True)

In [27]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1


In [28]:
len(data)

38489

In [29]:
data_dup=data.copy()

In [30]:
data

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [31]:
Y=data['Status']

In [32]:
X=data
X.drop(['Status'],axis=1,inplace=True)

In [33]:
X.head()


Unnamed: 0,UTC,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,100,961,1852,8,0,1,30.2,88.6
1,1200,130,924,1853,9,0,1,19.7,56.2
2,1200,110,938,1854,9,0,1,28.0,78.6
3,1800,100,950,1854,9,0,1,31.6,81.1
4,2000,100,950,1854,9,0,1,31.7,81.1


In [34]:
Y.head()

0     HU
1     HU
2     HU
3     HU
4     HU
Name: Status, dtype: object

In [35]:
Y.shape

(38489,)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 

  from numpy.core.umath_tests import inner1d


In [37]:
#model=RandomForestClassifier(n_estimators=500)
#model.fit(X,Y)

In [38]:
#Feature selection

In [39]:
#features = pd.Series(model.feature_importances_ , index= X.columns).sort_values(ascending=False)

In [40]:
#print(features)

In [41]:
#Now we choose the top 5 features 

In [42]:
data_dup

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [43]:
data_selected=data_dup[['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num','Status']]

In [44]:
data_selected.head()

Unnamed: 0,MSWS,MinCentralPressure,Latitude_num,Year,Longitude_num,Status
0,100,961,30.2,1852,88.6,HU
1,130,924,19.7,1853,56.2,HU
2,110,938,28.0,1854,78.6,HU
3,100,950,31.6,1854,81.1,HU
4,100,950,31.7,1854,81.1,HU


In [45]:
Y=data_selected['Status']
X=data_selected
X.drop(['Status'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [46]:
#Split test and train

In [47]:
X_train , X_test , Y_train, Y_test  = train_test_split(X,Y, test_size=0.1, random_state=32)

In [48]:
#Feature scaling
from sklearn.preprocessing import StandardScaler

In [49]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [62]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 11, metric = 'minkowski', p = 1)
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=1,
           weights='uniform')

In [63]:
prediction=classifier.predict(X_test)
print(np.array(Y_test))

[' TS' ' TD' ' TD' ... ' EX' ' TD' ' HU']


In [64]:
cm = confusion_matrix(Y_test, prediction)
print(cm)
accuracy_score(Y_test, prediction)

[[  23    0    0    8    0    0    0    5    2    0    0]
 [   0  169   10    5    0    4    0   17   35    0    0]
 [   0    1  994    0    0    0    0    0    6    1    0]
 [   8    7    0  341    0    1    0   42    8    0    0]
 [   0    0    0    1    8    0    0   13    1    0    0]
 [   0    6    3    0    0   17    0    1   30    0    0]
 [   0    0    0    0    0    0    1    0    0    0    0]
 [   4   13    0   49    4    2    0  673    3    0    1]
 [   1   15    6   11    0    8    0    8 1272    0    0]
 [   0    0    0    0    0    0    0    0    0    4    0]
 [   1    0    0    1    0    0    0    1    2    0    2]]


0.9103663289166017

In [124]:
np.unique(Y_test)

array([' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV'], dtype=object)

In [128]:
fn=['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num']
cn=[' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV','wut']

In [129]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus



In [130]:
tree.export_graphviz(viz,
                     out_file="tree.dot",
                     feature_names = fn, 
                     class_names=cn,
                     filled = True)

IndexError: list index out of range