In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import datetime as dt
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv('dataset.csv')

In [3]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,18520826,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,18530903,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,18540907,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,18540908,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,18540908,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [4]:
len(data)

38489

In [5]:
data['Date']

0        18520826
1        18530903
2        18540907
3        18540908
4        18540908
           ...   
38484    20201120
38485    20201120
38486    20201120
38487    20201120
38488    20201121
Name: Date, Length: 38489, dtype: int64

In [6]:
#Converting Date to readable format
data['Date']=pd.to_datetime(data['Date'] , format='%Y%m%d')

In [7]:
data.head()

Unnamed: 0,Date,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20
0,1852-08-26,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,1853-09-03,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,1854-09-07,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,1854-09-08,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,1854-09-08,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [8]:
#Extracting Year and month from Date

In [9]:
data['Year']=data['Date'].apply(lambda date:date.year)
data['Month']=data['Date'].apply(lambda date:date.month)
data.drop(['Date'],axis=1,inplace=True)

In [10]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1852,8
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1853,9
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,1854,9


In [11]:
#Adding hemisphere column ( N,E- > 0 S,W-> 1)

In [12]:
def Convhemisphere(coord):
        hemisphere = re.findall(r'[NSWE]' , coord)[0]
        if hemisphere == 'N' or hemisphere == 'E':
            return 0
        else:
            return 1

In [13]:
data['Latitude_Hemisphere'] = data['Latitude'].apply(Convhemisphere)
data['Longitude_Hemisphere'] = data['Longitude'].apply(Convhemisphere)
data['Latitude_Hemisphere'] = data['Latitude_Hemisphere'].astype('category')
data['Longitude_Hemisphere'] = data['Longitude_Hemisphere'].astype('category')

In [14]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column15,Column16,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1852,8,0,1
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1853,9,0,1
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,1854,9,0,1


In [15]:
#Converting Latitude to numerical type

In [16]:
def ConvNum(coord):
        num = coord.translate({ ord(c): None for c in "NSEW" })
        return num

In [17]:
ConvNum(data['Latitude'][10])

' 42.5'

In [18]:
data['Latitude'][10].translate({ ord(c): None for c in "NS" })

' 42.5'

In [19]:
data['Latitude_num'] =  data['Latitude'].apply(ConvNum)
data['Longitude_num'] =   data['Longitude'].apply(ConvNum)

In [20]:
data.head()

Unnamed: 0,UTC,Status,Latitude,Longitude,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,30.2N,88.6W,100,961,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,19.7N,56.2W,130,924,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,28.0N,78.6W,110,938,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,31.6N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,31.7N,81.1W,100,950,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [21]:
#Dropping latitude and longitude

In [22]:
data.drop(['Latitude','Longitude'],axis=1,inplace=True)

In [23]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Column9,Column10,Column11,Column12,Column13,Column14,...,Column17,Column18,Column19,Column20,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,1854,9,0,1,31.7,81.1


In [24]:
#Dropping columns with null

In [27]:
drop=['Column9','Column10','Column11','Column12','Column13','Column14','Column15','Column16','Column17','Column18','Column19','Column20']

In [28]:
data.drop(drop,axis=1,inplace=True)

In [29]:
data.head()

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1


In [30]:
len(data)

38489

In [31]:
data_dup=data.copy()

In [32]:
data

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [33]:
Y=data['Status']

In [34]:
X=data
X.drop(['Status'],axis=1,inplace=True)

In [35]:
X.head()


Unnamed: 0,UTC,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,100,961,1852,8,0,1,30.2,88.6
1,1200,130,924,1853,9,0,1,19.7,56.2
2,1200,110,938,1854,9,0,1,28.0,78.6
3,1800,100,950,1854,9,0,1,31.6,81.1
4,2000,100,950,1854,9,0,1,31.7,81.1


In [36]:
Y.head()

0     HU
1     HU
2     HU
3     HU
4     HU
Name: Status, dtype: object

In [37]:
Y.shape

(38489,)

In [38]:
#Split test and train

In [39]:
X_train , X_test , Y_train, Y_test  = train_test_split(X,Y, test_size=0.2, random_state=32)

In [40]:
print(len(X_train),len(Y_train))
print(len(X_test),len(Y_test))

30791 30791
7698 7698


In [41]:
#Feature scaling

In [42]:
X_test1=X_test

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [43]:
print(X_test)

[[ 1.31225279  0.1748573   0.02376138 ...  0.08675213 -1.11454866
   0.53351466]
 [-1.35915686 -0.76560455  0.66587639 ...  0.08675213 -1.05226255
   0.61224755]
 [ 1.31225279 -0.76560455  0.66587639 ...  0.08675213 -1.11454866
   1.40801204]
 ...
 [ 0.4217829   0.36294968  0.02376138 ...  0.08675213 -1.23912087
  -0.23694283]
 [-1.35915686 -0.76560455  0.71938597 ...  0.08675213 -0.88616626
   0.48571256]
 [-0.46868698 -0.57751218  0.2913093  ...  0.08675213  1.64680214
  -0.39722048]]


In [44]:
print(X_test1)

        UTC  MSWS  MinCentralPressure  Year  Month Latitude_Hemisphere  \
28681  1800    55                 994  2003     10                   0   
30221     0    30                1006  2007      5                   0   
38067  1800    30                1006  2020      6                   0   
8490    600    30                1007  1995      8                   0   
9430   1200    35                1009  1997      6                   0   
...     ...   ...                 ...   ...    ...                 ...   
6551      0   120                 943  1989      9                   0   
21607  1800    70                 960  2020     10                   0   
36229  1200    60                 994  2016     11                   0   
23600     0    30                1007  1991      8                   0   
13023   600    35                 999  2004      9                   0   

      Longitude_Hemisphere Latitude_num Longitude_num  
28681                    1         12.2         107.8  

In [45]:
#Using random forest Classifier

In [46]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 

  from numpy.core.umath_tests import inner1d


In [47]:
model=RandomForestClassifier(n_estimators=500)
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
prediction=model.predict(X_test)

In [49]:
print(np.array(Y_test))

[' TS' ' TD' ' TD' ... ' TS' ' TD' ' EX']


In [50]:
print(prediction)

[' TS' ' TD' ' TD' ... ' TS' ' TD' ' EX']


In [51]:
#print(np.concatenate((prediction.reshape(len(prediction),1), Y_test.reshape(len(Y_test),1)),1))

In [52]:
print(prediction[0])

 TS


In [53]:
print(accuracy_score(Y_test,prediction))

0.9428422967004416


In [322]:
#model.predict(sc.transform([[19690905,45,997]]))

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(Y_test, prediction)
print(cm)
accuracy_score(Y_test, prediction)

[[  64    0    0   12    0    0    0    5    6    0    0]
 [   0  375   18   14    1    1    0   16   56    0    0]
 [   0    2 2025    0    0    0    0    0    0    0    0]
 [   3   15    0  721    0    0    0   64   20    0    1]
 [   0    2    0    1   30    0    0   24    0    0    0]
 [   0    2    0    0    0   58    0    0   53    0    0]
 [   0    0    0    0    0    0    2    0    0    0    0]
 [   2   14    0   71    4    0    0 1424    0    0    0]
 [   0   19    1    2    0    4    0    0 2543    0    0]
 [   0    0    0    0    0    0    0    0    0    9    0]
 [   0    0    0    0    0    0    0    3    4    0    7]]


0.9428422967004416

In [55]:
#Feature selection

In [56]:
features = pd.Series(model.feature_importances_ , index= X_test1.columns).sort_values(ascending=False)

In [57]:
print(features)

MSWS                    0.487871
MinCentralPressure      0.207574
Latitude_num            0.106634
Year                    0.081899
Longitude_num           0.072841
Month                   0.028917
UTC                     0.010624
Longitude_Hemisphere    0.003640
Latitude_Hemisphere     0.000000
dtype: float64


In [58]:
#Now we choose the top 5 features 

In [59]:
data_dup

Unnamed: 0,UTC,Status,MSWS,MinCentralPressure,Year,Month,Latitude_Hemisphere,Longitude_Hemisphere,Latitude_num,Longitude_num
0,600,HU,100,961,1852,8,0,1,30.2,88.6
1,1200,HU,130,924,1853,9,0,1,19.7,56.2
2,1200,HU,110,938,1854,9,0,1,28.0,78.6
3,1800,HU,100,950,1854,9,0,1,31.6,81.1
4,2000,HU,100,950,1854,9,0,1,31.7,81.1
...,...,...,...,...,...,...,...,...,...,...
38484,0,LO,30,1008,2020,11,0,1,16.8,121.7
38485,600,LO,30,1008,2020,11,0,1,16.7,122.6
38486,1200,LO,30,1009,2020,11,0,1,16.7,123.7
38487,1800,LO,25,1010,2020,11,0,1,16.6,124.8


In [60]:
data_selected=data_dup[['MSWS','MinCentralPressure','Latitude_num','Year','Longitude_num','Status']]

In [61]:
data_selected.head()

Unnamed: 0,MSWS,MinCentralPressure,Latitude_num,Year,Longitude_num,Status
0,100,961,30.2,1852,88.6,HU
1,130,924,19.7,1853,56.2,HU
2,110,938,28.0,1854,78.6,HU
3,100,950,31.6,1854,81.1,HU
4,100,950,31.7,1854,81.1,HU


In [62]:
Y=data_selected['Status']
X=data_selected
X.drop(['Status'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [63]:
X_train , X_test , Y_train, Y_test  = train_test_split(X,Y, test_size=0.1, random_state=32)

In [64]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [65]:
model=RandomForestClassifier(n_estimators=1000)
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [66]:
prediction=model.predict(X_test)
print(np.array(Y_test))

[' TS' ' TD' ' TD' ... ' EX' ' TD' ' HU']


In [67]:
cm = confusion_matrix(Y_test, prediction)
print(cm)
accuracy_score(Y_test, prediction)

[[  32    0    0    3    0    0    0    2    1    0    0]
 [   0  190    8    4    1    1    0    9   27    0    0]
 [   0    1 1001    0    0    0    0    0    0    0    0]
 [   2    6    0  356    0    0    0   36    7    0    0]
 [   0    0    0    1   11    0    0   11    0    0    0]
 [   0    1    0    0    0   30    0    0   26    0    0]
 [   0    0    0    0    0    0    1    0    0    0    0]
 [   1    5    0   34    1    0    0  708    0    0    0]
 [   0   14    0    4    0    1    0    0 1302    0    0]
 [   0    0    0    0    0    0    0    0    0    4    0]
 [   0    0    0    0    0    0    0    0    1    0    6]]


0.9459599896076903

In [68]:
X_test

array([[ 0.1745114 ,  0.02412219, -1.11577736,  0.19253097,  0.53401927],
       [-0.76476573,  0.66532379, -1.05356645,  0.41163926,  0.61274919],
       [-0.76476573,  0.66532379, -1.11577736,  1.12374119,  1.40848376],
       ...,
       [ 0.1745114 , -0.08274475,  1.23786866, -1.28644997, -1.53826475],
       [-0.76476573,  0.66532379, -0.64919555,  0.5211934 ,  0.62399632],
       [ 1.48949938, -1.20484756,  1.66297653, -0.35523975, -0.57101142]])

In [347]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, Y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Random Forest Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

ValueError: operands could not be broadcast together with shapes (835120,2) (5,) (835120,2) 

In [69]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [70]:
svmprediction=classifier.predict(X_test)

In [71]:

cm = confusion_matrix(Y_test, svmprediction)
print(cm)
accuracy_score(Y_test,svmprediction)

[[   0    0    0   28    0    0    0    7    3    0    0]
 [   0  154   14   14    0    0    0   18   40    0    0]
 [   0    2 1000    0    0    0    0    0    0    0    0]
 [   0    7    0  304    0    0    0   80   16    0    0]
 [   0    1    0    2    1    0    0   19    0    0    0]
 [   0    7    0    0    0    0    0    0   50    0    0]
 [   0    0    1    0    0    0    0    0    0    0    0]
 [   0   14    0   87    1    0    0  647    0    0    0]
 [   0   25    0    0    0    0    0    0 1296    0    0]
 [   0    0    2    0    0    0    0    0    0    2    0]
 [   0    0    0    2    0    0    0    3    2    0    0]]


0.8843855546895297

In [72]:
import tensorflow as tf
Y_test=np.array(Y_test)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [73]:
np.unique(Y_test)

array([' DB', ' EX', ' HU', ' LO', ' SD', ' SS', ' ST', ' TD', ' TS',
       ' TY', ' WV'], dtype=object)