In [1]:
import numpy as np
import pandas as pd
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn deprecation and
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Path of the raw CSV file (a relative path, resolved against the working directory).
data = 'dev_in.csv'

# header=None makes pandas label the columns with integers and keep the
# file's first line (the column names) as an ordinary data row.
df = pd.read_csv(data, sep=',', header=None)

In [3]:
# View the dimensions of the raw frame as (rows, columns).
# The file was read with header=None, so the row count includes the CSV's header line.

df.shape

(50001, 129)

In [4]:
# Preview the first rows of the raw frame.
# Row 0 still holds the column names because the file was read with header=None.

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,fact_time,fact_latitude,fact_longitude,fact_temperature,fact_cwsm_class,climate,topography_bathymetry,sun_elevation,climate_temperature,climate_pressure,...,cmc_0_1_66_0_grad,cmc_0_1_66_0_next,cmc_0_1_67_0_grad,cmc_0_1_67_0_next,cmc_0_1_68_0_grad,cmc_0_1_68_0_next,gfs_2m_dewpoint_grad,gfs_2m_dewpoint_next,gfs_total_clouds_cover_low_grad,gfs_total_clouds_cover_low_next
1,1539162000,-40.35,-9.88,11.0,10,tropical,-843.0,31.78248998153052,10.07071428571432,765.6312283014016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.505035400390625,2.6475769042968977,2.0,2.0
2,1545006600,53.421299,-6.270070000000004,4.0,10,mild temperate,67.0,-59.69152145231892,7.005000000000032,752.8976150963312,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.4000244140625,1.2499938964843975,0.0,0.0
3,1540094400,-19.7577,63.361,26.0,10,dry,6.0,35.250889085578635,23.32714285714289,763.1150163007375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.100006103515625,21.05001220703128,-1.0,1.0
4,1552611600,35.245899,47.009201,5.0,10,mild temperate,1390.0,-23.755614985282445,3.1092857142857446,609.4193331163475,...,1.69672,5.1653,0.0,4.8750000000000006e-05,0.0,0.0,-1.5,-0.3499816894531022,-12.0,81.0


In [5]:
# Inspect the column labels; at this point they are still the default integer positions.
df.columns

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            119, 120, 121, 122, 123, 124, 125, 126, 127, 128],
           dtype='int64', length=129)

In [6]:
# Promote the first row (which holds the names from the CSV header line) to
# column labels, then discard that row from the data.
# Re-running this cell on its own would consume one more data row each time.
# NOTE(review): the values were parsed alongside the header text, so every column
# stays object dtype after this step — confirm whether numeric conversion is wanted.
header_row = 0
df.columns = df.iloc[header_row]
df = df.iloc[1:]

In [7]:
# Confirm that the column labels now carry the names taken from the header row.
df.columns

Index(['fact_time', 'fact_latitude', 'fact_longitude', 'fact_temperature',
       'fact_cwsm_class', 'climate', 'topography_bathymetry', 'sun_elevation',
       'climate_temperature', 'climate_pressure',
       ...
       'cmc_0_1_66_0_grad', 'cmc_0_1_66_0_next', 'cmc_0_1_67_0_grad',
       'cmc_0_1_67_0_next', 'cmc_0_1_68_0_grad', 'cmc_0_1_68_0_next',
       'gfs_2m_dewpoint_grad', 'gfs_2m_dewpoint_next',
       'gfs_total_clouds_cover_low_grad', 'gfs_total_clouds_cover_low_next'],
      dtype='object', name=0, length=129)

In [8]:
# Swap the prediction target ('climate') with the last column so the target
# ends up at the end of the frame.
col_list = df.columns.tolist()
x = col_list.index('climate')
y = col_list.index('gfs_total_clouds_cover_low_next')
col_list[x], col_list[y] = col_list[y], col_list[x]
df = df[col_list]

In [9]:
# Confirm the new column order after the swap ('climate' is expected in the last position).
df.columns

Index(['fact_time', 'fact_latitude', 'fact_longitude', 'fact_temperature',
       'fact_cwsm_class', 'gfs_total_clouds_cover_low_next',
       'topography_bathymetry', 'sun_elevation', 'climate_temperature',
       'climate_pressure',
       ...
       'cmc_0_1_66_0_grad', 'cmc_0_1_66_0_next', 'cmc_0_1_67_0_grad',
       'cmc_0_1_67_0_next', 'cmc_0_1_68_0_grad', 'cmc_0_1_68_0_next',
       'gfs_2m_dewpoint_grad', 'gfs_2m_dewpoint_next',
       'gfs_total_clouds_cover_low_grad', 'climate'],
      dtype='object', name=0, length=129)

In [10]:
# View a summary of the dataset: index range, column count, dtypes and memory usage.
# NOTE(review): the recorded output reports all 129 columns as object dtype, presumably
# because the header line was read as a data row — confirm before treating values as numbers.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 1 to 50000
Columns: 129 entries, fact_time to climate
dtypes: object(129)
memory usage: 49.2+ MB


In [11]:
# y is the target label ('climate'); X holds every remaining column as features.
y = df['climate']

X = df.drop(columns=['climate'])

In [12]:
# Split X and y into training and testing sets:
# 30% of the rows are held out for testing, the remaining 70% are used for training.
# random_state is fixed so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [13]:
# Check the (rows, columns) shape of the training and test feature frames.

X_train.shape, X_test.shape

((35000, 128), (15000, 128))

In [14]:
# Check the dtype of every feature column in X_train.
# NOTE(review): the recorded output shows object dtype throughout — verify that the
# numeric features are meant to stay as strings.

X_train.dtypes

0
fact_time                          object
fact_latitude                      object
fact_longitude                     object
fact_temperature                   object
fact_cwsm_class                    object
                                    ...  
cmc_0_1_68_0_grad                  object
cmc_0_1_68_0_next                  object
gfs_2m_dewpoint_grad               object
gfs_2m_dewpoint_next               object
gfs_total_clouds_cover_low_grad    object
Length: 128, dtype: object

In [15]:
# Preprocessing starts here.
# Collect the names of all object-dtype columns and treat them as categorical.
# NOTE(review): the recorded X_train.dtypes output lists every column as object, so this
# list likely contains the numeric weather features as well — confirm that is intended.

categorical = [name for name, dtype in X_train.dtypes.items() if dtype == 'O']


In [16]:
# Fraction (0-1, not a percentage) of missing values per column in the training set,
# restricted to the columns listed in `categorical`.

X_train[categorical].isnull().mean()

0
fact_time                          0.000000
fact_latitude                      0.000000
fact_longitude                     0.000000
fact_temperature                   0.000000
fact_cwsm_class                    0.000000
                                     ...   
cmc_0_1_68_0_grad                  0.000714
cmc_0_1_68_0_next                  0.000714
gfs_2m_dewpoint_grad               0.000114
gfs_2m_dewpoint_next               0.000057
gfs_total_clouds_cover_low_grad    0.000114
Length: 128, dtype: float64

In [17]:
# List each column from `categorical` that has at least one missing value in
# the training set, together with its fraction of missing rows.

for col in categorical:
    missing_fraction = X_train[col].isnull().mean()
    if missing_fraction > 0:
        print(col, missing_fraction)

gfs_total_clouds_cover_low_next 5.714285714285714e-05
cmc_0_0_0_1000 0.0007142857142857143
cmc_0_0_0_2_grad 0.0007142857142857143
cmc_0_0_0_2_interpolated 0.0007142857142857143
cmc_0_0_0_2_next 0.0007142857142857143
cmc_0_0_0_2 0.0007142857142857143
cmc_0_0_0_500 0.0007142857142857143
cmc_0_0_0_700 0.0007142857142857143
cmc_0_0_0_850 0.0007142857142857143
cmc_0_0_0_925 0.0007142857142857143
cmc_0_0_6_2 0.0007142857142857143
cmc_0_0_7_1000 0.0007142857142857143
cmc_0_0_7_2 0.0007142857142857143
cmc_0_0_7_500 0.0007142857142857143
cmc_0_0_7_700 0.0007142857142857143
cmc_0_0_7_850 0.0007142857142857143
cmc_0_0_7_925 0.0007142857142857143
cmc_0_1_0_0 0.0007142857142857143
cmc_0_1_11_0 0.0007142857142857143
cmc_0_1_65_0 0.0007142857142857143
cmc_0_1_66_0 0.0007142857142857143
cmc_0_1_67_0 0.0007142857142857143
cmc_0_1_68_0 0.0007142857142857143
cmc_0_1_7_0 0.0007142857142857143
cmc_0_2_2_10 0.0007142857142857143
cmc_0_2_2_1000 0.0007142857142857143
cmc_0_2_2_500 0.0007142857142857143
cmc_0_

In [18]:
# Impute missing values in the `categorical` columns with the most frequent
# value observed in the training set.
#
# * The fill value is always taken from X_train, so nothing learned from the
#   test split leaks into preprocessing.
# * A column is imputed when EITHER split has missing values; checking only
#   X_train would leave NaNs in X_test for columns that happen to be complete
#   in the training split.
# * The filled column is assigned back with `df2[col] = ...` instead of calling
#   `df2[col].fillna(..., inplace=True)`. The in-place form operates on a
#   temporary Series (chained assignment) and does not update the frame when
#   pandas Copy-on-Write is active.

for col in categorical:
    if X_train[col].isnull().any() or X_test[col].isnull().any():
        # mode() may return several values on ties; take the first one, as before.
        fill_value = X_train[col].mode()[0]
        for df2 in [X_train, X_test]:
            df2[col] = df2[col].fillna(fill_value)

In [19]:
# Count the missing values left in X_train, per column, for the columns listed in `categorical`.
# Every count is expected to be 0 once imputation has run.

X_train[categorical].isnull().sum()

0
fact_time                          0
fact_latitude                      0
fact_longitude                     0
fact_temperature                   0
fact_cwsm_class                    0
                                  ..
cmc_0_1_68_0_grad                  0
cmc_0_1_68_0_next                  0
gfs_2m_dewpoint_grad               0
gfs_2m_dewpoint_next               0
gfs_total_clouds_cover_low_grad    0
Length: 128, dtype: int64

In [20]:
# Count the missing values left in X_test, per column, for the columns listed in `categorical`.
# Every count is expected to be 0 once imputation has run.

X_test[categorical].isnull().sum()

0
fact_time                          0
fact_latitude                      0
fact_longitude                     0
fact_temperature                   0
fact_cwsm_class                    0
                                  ..
cmc_0_1_68_0_grad                  0
cmc_0_1_68_0_next                  0
gfs_2m_dewpoint_grad               0
gfs_2m_dewpoint_next               0
gfs_total_clouds_cover_low_grad    0
Length: 128, dtype: int64

In [21]:
# Count missing values across every column of X_train, not only those listed in `categorical`.

X_train.isnull().sum()

0
fact_time                          0
fact_latitude                      0
fact_longitude                     0
fact_temperature                   0
fact_cwsm_class                    0
                                  ..
cmc_0_1_68_0_grad                  0
cmc_0_1_68_0_next                  0
gfs_2m_dewpoint_grad               0
gfs_2m_dewpoint_next               0
gfs_total_clouds_cover_low_grad    0
Length: 128, dtype: int64

In [22]:
# Count missing values across every column of X_test, not only those listed in `categorical`.

X_test.isnull().sum()

0
fact_time                          0
fact_latitude                      0
fact_longitude                     0
fact_temperature                   0
fact_cwsm_class                    0
                                  ..
cmc_0_1_68_0_grad                  0
cmc_0_1_68_0_next                  0
gfs_2m_dewpoint_grad               0
gfs_2m_dewpoint_next               0
gfs_total_clouds_cover_low_grad    0
Length: 128, dtype: int64

In [113]:
# Display the list of column names that are being treated as categorical.
# NOTE(review): this cell's execution counter (113) is out of sequence with its
# neighbours — re-run the notebook top to bottom to confirm the output is current.

categorical

['fact_time',
 'fact_latitude',
 'fact_longitude',
 'fact_temperature',
 'fact_cwsm_class',
 'gfs_total_clouds_cover_low_next',
 'topography_bathymetry',
 'sun_elevation',
 'climate_temperature',
 'climate_pressure',
 'cmc_0_0_0_1000',
 'cmc_0_0_0_2_grad',
 'cmc_0_0_0_2_interpolated',
 'cmc_0_0_0_2_next',
 'cmc_0_0_0_2',
 'cmc_0_0_0_500',
 'cmc_0_0_0_700',
 'cmc_0_0_0_850',
 'cmc_0_0_0_925',
 'cmc_0_0_6_2',
 'cmc_0_0_7_1000',
 'cmc_0_0_7_2',
 'cmc_0_0_7_500',
 'cmc_0_0_7_700',
 'cmc_0_0_7_850',
 'cmc_0_0_7_925',
 'cmc_0_1_0_0',
 'cmc_0_1_11_0',
 'cmc_0_1_65_0',
 'cmc_0_1_66_0',
 'cmc_0_1_67_0',
 'cmc_0_1_68_0',
 'cmc_0_1_7_0',
 'cmc_0_2_2_10',
 'cmc_0_2_2_1000',
 'cmc_0_2_2_500',
 'cmc_0_2_2_700',
 'cmc_0_2_2_850',
 'cmc_0_2_2_925',
 'cmc_0_2_3_10',
 'cmc_0_2_3_1000',
 'cmc_0_2_3_500',
 'cmc_0_2_3_700',
 'cmc_0_2_3_850',
 'cmc_0_2_3_925',
 'cmc_0_3_0_0',
 'cmc_0_3_0_0_next',
 'cmc_0_3_1_0',
 'cmc_0_3_5_1000',
 'cmc_0_3_5_500',
 'cmc_0_3_5_700',
 'cmc_0_3_5_850',
 'cmc_0_3_5_925',
 'cmc

In [23]:
# Preview the first training rows for the columns listed in `categorical`.
X_train[categorical].head()

Unnamed: 0,fact_time,fact_latitude,fact_longitude,fact_temperature,fact_cwsm_class,gfs_total_clouds_cover_low_next,topography_bathymetry,sun_elevation,climate_temperature,climate_pressure,...,cmc_0_1_65_0_next,cmc_0_1_66_0_grad,cmc_0_1_66_0_next,cmc_0_1_67_0_grad,cmc_0_1_67_0_next,cmc_0_1_68_0_grad,cmc_0_1_68_0_next,gfs_2m_dewpoint_grad,gfs_2m_dewpoint_next,gfs_total_clouds_cover_low_grad
17968,1548024840,33.4608,-111.727997,23.0,0,0.0,415.0,25.435328,14.219286,724.660785,...,0.0021,0.0,0.0,0.0,0.0,0.0,0.0,-1.659729,-5.649847,0.0
32392,1546131000,38.1511,21.4256,4.0,10,0.0,13.0,-66.272332,9.300714,755.073676,...,0.0,0.0,0.0,0.012594,0.016259,0.0,0.0,-0.126343,3.050012,0.0
9342,1554130800,41.060556,14.078889,17.0,10,0.0,6.0,26.09395,16.555714,760.032923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.104095,7.584283,0.0
7930,1542898800,43.907299,4.90183,15.0,0,1.0,43.0,10.729241,14.544286,754.439604,...,20.5697,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,8.450006,-1.0
46545,1536051600,40.65,-3.74,25.0,10,0.0,792.0,34.232497,16.871429,699.437286,...,1.55355,0.0,0.0,0.0,0.0,0.0,0.0,-2.099976,7.150018,0.0


In [24]:
# Remember the feature column labels. In the visible notebook only the
# commented-out scaling cells refer to `cols`.
cols = X_train.columns

In [116]:
# from sklearn.preprocessing import RobustScaler

# scaler = RobustScaler()

# X_train = scaler.fit_transform(X_train)

# X_test = scaler.transform(X_test)

In [117]:
# convert the scaled array back to a DataFrame with the original column names to avoid mismatch errors
# X_train = pd.DataFrame(X_train, columns=[cols])

In [118]:
# X_test = pd.DataFrame(X_test, columns=[cols])

In [119]:
# X_train.head()

In [25]:
# Train a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB

# Create the estimator with its default settings.
gnb = GaussianNB()

# Learn the model parameters from the training features and labels.
gnb.fit(X=X_train, y=y_train)

In [26]:
# Predict a label for every row of the held-out test features.
y_pred = gnb.predict(X_test)

# Display the predicted labels.
y_pred

array(['tropical', 'mild temperate', 'mild temperate', ...,
       'mild temperate', 'tropical', 'mild temperate'], dtype='<U14')

In [27]:
from sklearn.metrics import accuracy_score

# Compare the test-set predictions with the true test labels.
test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(test_accuracy))

Model accuracy score: 0.6981


In [28]:
# Predict labels for the rows the model was trained on, to compare against the test-set result.
y_pred_train = gnb.predict(X_train)

# Display the predicted labels.
y_pred_train

array(['mild temperate', 'mild temperate', 'tropical', ..., 'tropical',
       'mild temperate', 'mild temperate'], dtype='<U14')

In [29]:
# Compare the training-set predictions with the true training labels.
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))

Training-set accuracy score: 0.6943


In [30]:
# Report the estimator's score on both splits, scaled by 100 so the values read as percentages.
# A large gap between the two would point to over- or under-fitting.

train_score_pct = gnb.score(X_train, y_train) * 100
test_score_pct = gnb.score(X_test, y_test) * 100

print('Training set accuracy: {:.4f}'.format(train_score_pct))

print('Test set accuracy: {:.4f}'.format(test_score_pct))


Training set accuracy: 69.4286
Test set accuracy: 69.8133
