In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

train = pd.read_csv('train.csv')

train.head()

Unnamed: 0,indexId,Scientific Name,Annual Mean Temperature,Mean Diurnal Range,Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,...,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality,Precipitation of Wettest Quarter,Precipitation of Driest Quarter,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter
0,1,Cacatua haematuropygia,,,,,,,,,...,,,,,,,,,,
1,4,Anas luzonica,,,,,,,,,...,,,,,,,,,,
2,7,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,...,178.0,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993
3,8,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,...,71.0,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442
4,10,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,...,429.0,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139


In [2]:
# Label Encoding the target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(train['Scientific Name'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

train['y'] = le.transform(train['Scientific Name'])

train = train.drop('indexId', axis=1)

{'Alcedo argentata': 0, 'Anas luzonica': 1, 'Cacatua haematuropygia': 2, 'Egretta eulophotes': 3, 'Pithecophaga jeffeyri': 4}


In [3]:
train = train.dropna()

#Setting X and y
y = train['y']
X = train.drop(['Scientific Name', 'y'], axis = 1)

In [7]:
test = pd.read_csv('test.csv')

In [8]:
test_val = test.values

In [24]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

RAND = 11
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RAND)


data_dmatrix = xgb.DMatrix(data=X, label=y)

xg = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.2, learning_rate= 0.001, n_estimators = 200,
                      max_depth = 5, alpha = 10)

xg.fit(X_train, y_train)

preds = xg.predict(X_test)

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds, labels = [0, 1, 2, 3, 4], target_names = ['Alcedo argentata', 'Anas luzonica', 
                                                                                     'Cacatua haematuropygia', 
                                                                                     'Egretta eulophotes',
                                                                                     'Pithecophaga jeffeyri']))

                        precision    recall  f1-score   support

      Alcedo argentata       0.00      0.00      0.00         3
         Anas luzonica       0.50      0.38      0.43         8
Cacatua haematuropygia       0.60      0.46      0.52        13
    Egretta eulophotes       0.40      0.67      0.50         3
 Pithecophaga jeffeyri       0.44      0.64      0.52        11

             micro avg       0.47      0.47      0.47        38
             macro avg       0.39      0.43      0.39        38
          weighted avg       0.47      0.47      0.46        38



In [26]:
xg = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.2, learning_rate= 0.001, n_estimators = 200,
                      max_depth = 5, alpha = 10)

xg.fit(X, y)

preds = xg.predict(X)

print(classification_report(y, preds, labels = [0, 1, 2, 3, 4], target_names = ['Alcedo argentata', 'Anas luzonica', 
                                                                                     'Cacatua haematuropygia', 
                                                                                     'Egretta eulophotes',
                                                                                     'Pithecophaga jeffeyri']))

                        precision    recall  f1-score   support

      Alcedo argentata       0.90      0.53      0.67        17
         Anas luzonica       0.74      0.77      0.75        30
Cacatua haematuropygia       0.80      0.84      0.82        51
    Egretta eulophotes       0.92      0.60      0.73        20
 Pithecophaga jeffeyri       0.78      0.90      0.84        69

             micro avg       0.80      0.80      0.80       187
             macro avg       0.83      0.73      0.76       187
          weighted avg       0.81      0.80      0.79       187



In [27]:
def feature_eng(train):
    train['ave_temp'] = (train['Annual Mean Temperature'] - (train['Max Temperature of Warmest Month'] - 
                                                         train['Min Temperature of Coldest Month']))/train['Annual Mean Temperature']
    train['ave_prec'] = (train['Annual Precipitation'] - (train['Precipitation of Wettest Month'] - 
                                                         train['Precipitation of Driest Month']))/train['Annual Precipitation']
    train['ave_quar_prec'] = (train['Precipitation Seasonality'] - (train['Precipitation of Wettest Quarter'] - 
                                                         train['Precipitation of Driest Quarter']))/train['Precipitation Seasonality']
    train['ave_quar_cold'] = (train['Precipitation Seasonality'] - (train['Precipitation of Coldest Quarter'] - 
                                                         train['Precipitation of Warmest Quarter']))/train['Precipitation Seasonality']
    train['temp_range'] = (train['Temperature Annual Range'] - (train['Mean Temperature of Wettest Quarter'] - 
                                                         train['Mean Temperature of Driest Quarter']))/train['Temperature Annual Range']
    
    to_drop = ['Annual Mean Temperature', 'Max Temperature of Warmest Month', 'Min Temperature of Coldest Month',
          'Annual Precipitation', 'Precipitation of Wettest Month', 'Precipitation of Driest Month',
          'Precipitation Seasonality', 'Precipitation of Wettest Quarter', 'Precipitation of Driest Quarter',
          'Precipitation of Coldest Quarter', 'Precipitation of Warmest Quarter',
          'Temperature Annual Range', 'Mean Temperature of Wettest Quarter', 'Mean Temperature of Driest Quarter']

    a = train.drop(to_drop, axis=1)
    
    return a

In [28]:
train = feature_eng(train)

In [30]:
train = train.drop('Scientific Name', axis=1)

In [34]:
train.head()

Unnamed: 0,Mean Diurnal Range,Isothermality,Temperature Seasonality,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,y,ave_temp,ave_prec,ave_quar_prec,ave_quar_cold,temp_range
2,19.902,2432.0,25.639,178.0,896.0,2,5.794849,3.238217,9.068648,1.065158,62.086038
3,20.849,2044.0,26.936,71.0,971.0,4,1.805548,4.657809,10.824964,1.055413,48.989502
4,21.074,2113.0,26.943,429.0,663.0,2,7.592267,0.904617,5.951247,1.029039,62.761302
6,20.507,2066.0,26.344,303.0,687.0,4,4.404413,1.347611,5.226569,1.026902,57.337375
8,21.604,2273.0,27.424,54.0,1236.0,4,1.309843,5.795903,14.476364,1.042043,53.92276


In [32]:
X_fe = train.drop('y', axis=1)
y_fe = train['y']

In [33]:
RAND = 11
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RAND)


data_dmatrix = xgb.DMatrix(data=X, label=y)

xg = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.2, learning_rate= 0.001, n_estimators = 200,
                      max_depth = 5, alpha = 10)

xg.fit(X_train, y_train)

preds = xg.predict(X_test)

print(classification_report(y_test, preds, labels = [0, 1, 2, 3, 4], target_names = ['Alcedo argentata', 'Anas luzonica', 
                                                                                     'Cacatua haematuropygia', 
                                                                                     'Egretta eulophotes',
                                                                                     'Pithecophaga jeffeyri']))

                        precision    recall  f1-score   support

      Alcedo argentata       0.00      0.00      0.00         3
         Anas luzonica       0.50      0.38      0.43         8
Cacatua haematuropygia       0.60      0.46      0.52        13
    Egretta eulophotes       0.40      0.67      0.50         3
 Pithecophaga jeffeyri       0.44      0.64      0.52        11

             micro avg       0.47      0.47      0.47        38
             macro avg       0.39      0.43      0.39        38
          weighted avg       0.47      0.47      0.46        38



In [61]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pca_df = train.drop('y', axis = 1).dropna()

std_df = scaler.fit_transform(pca_df)
pca = PCA(n_components=10)
pc = pca.fit_transform(std_df)

train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5',
                                       'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10'])
X = train_pca
y = train['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RAND, stratify=y)

from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe = RFE(estimator=LogisticRegression(random_state=RAND), n_features_to_select=3, verbose=1)


rfe.fit(X_train, y_train)

print('RFE features used:')
print(X.columns[rfe.support_])

print('RFE features ranked:')
print(dict(zip(X.columns, rfe.ranking_)))

print('Classification Report:')
print(classification_report(y_test, rfe.predict(X_test)))

Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
RFE features used:
Index(['PC-5', 'PC-6', 'PC-7'], dtype='object')
RFE features ranked:
{'PC-1': 8, 'PC-2': 7, 'PC-3': 6, 'PC-4': 5, 'PC-5': 1, 'PC-6': 1, 'PC-7': 1, 'PC-8': 4, 'PC-9': 2, 'PC-10': 3}
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         9
           2       0.44      0.44      0.44        16
           3       0.00      0.00      0.00         6
           4       0.51      1.00      0.68        21

   micro avg       0.49      0.49      0.49        57
   macro avg       0.19      0.29      0.22        57
weighted avg       0.31      0.49      0.37        57



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [42]:
train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5'])

In [43]:
train_pca.head()

Unnamed: 0,PC-1,PC-2,PC-3,PC-4,PC-5
0,-0.330153,-0.360775,1.280989,-0.200275,0.431103
1,-1.549068,1.25677,1.535915,-0.124748,0.580846
2,-0.580233,-0.962383,-1.594861,-0.461104,0.164886
3,-1.193364,-0.972938,-0.863031,-0.017379,-0.138775
4,-0.817361,2.754211,1.804303,-0.078663,0.344318


In [44]:
X = train_pca
y = train['y']

In [47]:
RAND = 11
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RAND)


data_dmatrix = xgb.DMatrix(data=X, label=y)

xg = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.2, learning_rate= 0.001, n_estimators = 200,
                      max_depth = 5, alpha = 10)

xg.fit(X_train, y_train)

preds = xg.predict(X_test)

print(classification_report(y_test, preds, labels = [0, 1, 2, 3, 4], target_names = ['Alcedo argentata', 'Anas luzonica', 
                                                                                     'Cacatua haematuropygia', 
                                                                                     'Egretta eulophotes',
                                                                                     'Pithecophaga jeffeyri']))

                        precision    recall  f1-score   support

      Alcedo argentata       0.00      0.00      0.00         3
         Anas luzonica       0.38      0.38      0.38         8
Cacatua haematuropygia       0.50      0.23      0.32        13
    Egretta eulophotes       0.40      0.67      0.50         3
 Pithecophaga jeffeyri       0.42      0.73      0.53        11

             micro avg       0.42      0.42      0.42        38
             macro avg       0.34      0.40      0.34        38
          weighted avg       0.40      0.42      0.38        38



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [49]:
pca.components_

array([[-0.16008916,  0.48242123, -0.15634156,  0.36318449,  0.4133926 ,
         0.40007214, -0.02401708,  0.27177805, -0.03183147,  0.42295512],
       [ 0.4481185 ,  0.07980335,  0.41625737, -0.2639781 ,  0.28423662,
         0.00317806,  0.47068839,  0.40474896, -0.27913036, -0.07604604],
       [-0.34151799, -0.01366194, -0.33301617, -0.34593427,  0.16319936,
        -0.32813104,  0.38466784,  0.30974464,  0.51871606,  0.07298683],
       [-0.29124024, -0.10746264, -0.50083941, -0.02178536,  0.00432471,
         0.22995433,  0.2052115 ,  0.13104773, -0.54808307, -0.48963065],
       [-0.10054876,  0.05165386,  0.29139092,  0.09271928,  0.17766698,
         0.39241941, -0.1319938 ,  0.10995881,  0.50068015, -0.65418945]])

In [69]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pca_df = train.drop('y', axis = 1).dropna()

std_df = scaler.fit_transform(pca_df)
pca = PCA(n_components=10)
pc = pca.fit_transform(std_df)

train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5',
                                       'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10'])
X = train_pca
y = train['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RAND, stratify=y)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train, y_train)

print('Classification Report:')
print(classification_report(y_test, lr.predict(X_test)))

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       1.00      0.33      0.50         9
           2       0.42      0.50      0.46        16
           3       0.00      0.00      0.00         6
           4       0.55      0.86      0.67        21

   micro avg       0.51      0.51      0.51        57
   macro avg       0.39      0.34      0.32        57
weighted avg       0.48      0.51      0.45        57



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [73]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = train.drop('y', axis = 1).dropna()
y = train['y']

std_df = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(std_df, y, test_size = 0.2, random_state = RAND, stratify=y)

from sklearn.linear_model import LogisticRegression

lr_scale = LogisticRegression()

lr_scale.fit(X_train, y_train)

print(lr_scale.score(X_test, y_test))

print('Classification Report:')
print(classification_report(y_test, lr_scale.predict(X_test)))

0.4473684210526316
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      0.33      0.50         6
           2       0.40      0.40      0.40        10
           3       0.00      0.00      0.00         4
           4       0.48      0.79      0.59        14

   micro avg       0.45      0.45      0.45        38
   macro avg       0.38      0.30      0.30        38
weighted avg       0.44      0.45      0.40        38



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [142]:
train = pd.read_csv('train.csv')
train1 = train.interpolate(method='values')

# import numpy as np
# from statsmodels.imputation.mice import MICEData

# SimpleImputer(copy=True, fill_value=None,
#               missing_values=np.nan, strategy='mean', verbose=0)

# train2 = imp.fit_transform(train1)
# imp.transform(train1)

In [143]:
train1.describe()

Unnamed: 0,indexId,Annual Mean Temperature,Mean Diurnal Range,Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,Mean Temperature of Driest Quarter,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality,Precipitation of Wettest Quarter,Precipitation of Driest Quarter,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter
count,241.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0
mean,403.983402,72.966699,20.535977,2385.292887,26.237073,90.21314,487.355649,8.237665,25.940941,552.073222,288.698745,921.328452,11.349659,27.308285,45.07478,31.885709,84.539749,340.100418,26.146946,25.062711
std,246.484516,7.037204,1.791577,564.049842,1.198141,32.13925,329.784849,1.047541,1.425185,140.167866,148.676685,289.950666,1.481615,1.126589,16.410842,1.075922,47.498003,113.319654,1.241753,1.488625
min,1.0,53.624,14.443,1317.0,21.49,30.813,40.0,4.733,20.87,266.0,28.0,384.0,7.0,22.796,13.115,28.1,7.0,145.0,19.741,19.741
25%,190.0,68.510667,19.69825,1930.5,25.787,65.66075,293.5,7.322167,25.3825,441.7,169.4,716.5,10.2475,26.8925,33.959,31.1565,47.5,268.0,25.736,24.39375
50%,398.0,72.574,20.998,2345.0,26.447,87.069,378.0,8.129,26.267,567.0,284.0,874.0,11.184,27.504,43.6225,32.031,84.5,320.0,26.416,25.387
75%,635.0,77.014,21.705,2710.0,27.101,107.63275,563.0,9.154,26.8855,659.0,370.0,1042.5,12.078,28.048,56.0585,32.6,112.0,380.0,27.0335,26.12325
max,826.0,89.591,24.1,4492.0,27.977,206.728,1981.0,10.346,28.5,926.0,651.0,2082.0,16.282,29.338,111.389,34.607,207.0,824.0,27.9,27.038


In [144]:
train1.head()

Unnamed: 0,indexId,Scientific Name,Annual Mean Temperature,Mean Diurnal Range,Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,...,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality,Precipitation of Wettest Quarter,Precipitation of Driest Quarter,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter
0,1,Cacatua haematuropygia,,,,,,,,,...,,,,,,,,,,
1,4,Anas luzonica,,,,,,,,,...,,,,,,,,,,
2,7,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,...,178.0,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993
3,8,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,...,71.0,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442
4,10,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,...,429.0,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139


In [158]:
# Label Encoding the target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(train1['Scientific Name'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

train2['y'] = le.transform(train1['Scientific Name'])

train2 = train1.drop('indexId', axis=1)

{'Alcedo argentata': 0, 'Anas luzonica': 1, 'Cacatua haematuropygia': 2, 'Egretta eulophotes': 3, 'Pithecophaga jeffeyri': 4}


In [164]:
train2 = train2.dropna()

In [165]:
train2.head()

Unnamed: 0,Scientific Name,Annual Mean Temperature,Mean Diurnal Range,Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,Mean Temperature of Driest Quarter,...,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality,Precipitation of Wettest Quarter,Precipitation of Driest Quarter,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter,y
2,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,462.0,...,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993,2
3,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,419.0,...,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442,4
4,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,589.0,...,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139,2
5,Anas luzonica,81.3145,20.7905,2089.5,26.6435,57.3555,456.5,9.6585,26.711,595.5,...,675.0,11.877,27.402,28.9065,32.6675,116.0,266.0,26.8735,25.9595,1
6,Pithecophaga jeffeyri,85.89,20.507,2066.0,26.344,51.595,344.0,10.226,25.894,602.0,...,687.0,11.907,27.047,31.186,32.414,97.0,234.0,26.652,25.78,4


In [194]:
X = train2.drop(['Scientific Name', 'y'],axis=1)
y = train2['y']

In [172]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# pca_df = X

# std_df = scaler.fit_transform(pca_df)
# pca = PCA(n_components=10)
# pc = pca.fit_transform(std_df)

# train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5',
#                                        'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10'])
# X = train_pca

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RAND, stratify=y)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train, y_train)

print('Classification Report:')
print(classification_report(y_test, lr.predict(X_test)))

print(lr.score(X_test, y_test))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.29      0.44         7
           1       0.40      0.13      0.20        15
           2       0.48      0.76      0.59        21
           3       0.00      0.00      0.00         7
           4       0.57      0.77      0.65        22

   micro avg       0.51      0.51      0.51        72
   macro avg       0.49      0.39      0.38        72
weighted avg       0.50      0.51      0.46        72

0.5138888888888888




In [173]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pca_df = X

std_df = scaler.fit_transform(pca_df)
pca = PCA(n_components=10)
pc = pca.fit_transform(std_df)

train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5',
                                       'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10'])
X = train_pca

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RAND, stratify=y)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train, y_train)

print('Classification Report:')
print(classification_report(y_test, lr.predict(X_test)))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.14      0.25         7
           1       0.50      0.13      0.21        15
           2       0.47      0.81      0.60        21
           3       0.00      0.00      0.00         7
           4       0.55      0.77      0.64        22

   micro avg       0.51      0.51      0.51        72
   macro avg       0.50      0.37      0.34        72
weighted avg       0.51      0.51      0.44        72



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [183]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pca_df = X

std_df = scaler.fit_transform(pca_df)
pca = PCA(n_components=10)
pc = pca.fit_transform(std_df)

train_pca = pd.DataFrame(pc, columns = ['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5',
                                       'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10'])
X = train_pca

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RAND, stratify=y)

from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe = RFE(estimator=LogisticRegression(random_state=RAND), n_features_to_select=18, verbose=1)


rfe.fit(X_train, y_train)

print('RFE features used:')
print(X.columns[rfe.support_])

print('RFE features ranked:')
print(dict(zip(X.columns, rfe.ranking_)))

print('Classification Report:')
print(classification_report(y_test, rfe.predict(X_test)))

RFE features used:
Index(['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5', 'PC-6', 'PC-7', 'PC-8', 'PC-9',
       'PC-10'],
      dtype='object')
RFE features ranked:
{'PC-1': 1, 'PC-2': 1, 'PC-3': 1, 'PC-4': 1, 'PC-5': 1, 'PC-6': 1, 'PC-7': 1, 'PC-8': 1, 'PC-9': 1, 'PC-10': 1}
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.29      0.44         7
           1       0.50      0.13      0.21        15
           2       0.45      0.81      0.58        21
           3       0.00      0.00      0.00         7
           4       0.54      0.68      0.60        22

   micro avg       0.50      0.50      0.50        72
   macro avg       0.50      0.38      0.37        72
weighted avg       0.50      0.50      0.44        72



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [195]:
RAND = 11
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RAND)


data_dmatrix = xgb.DMatrix(data=X, label=y)

xg = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.4, learning_rate= 0.001, n_estimators = 200, alpha = 0.1)

xg.fit(X_train, y_train)

preds = xg.predict(X_test)

print(classification_report(y_test, preds, labels = [0, 1, 2, 3, 4], target_names = ['Alcedo argentata', 'Anas luzonica', 
                                                                                     'Cacatua haematuropygia', 
                                                                                     'Egretta eulophotes',
                                                                                     'Pithecophaga jeffeyri']))

                        precision    recall  f1-score   support

      Alcedo argentata       1.00      0.14      0.25         7
         Anas luzonica       0.40      0.25      0.31         8
Cacatua haematuropygia       0.62      0.67      0.65        15
    Egretta eulophotes       0.57      0.80      0.67         5
 Pithecophaga jeffeyri       0.53      0.77      0.62        13

             micro avg       0.56      0.56      0.56        48
             macro avg       0.62      0.53      0.50        48
          weighted avg       0.61      0.56      0.53        48



In [196]:
test_val = test.values

In [197]:
test_val2 = test_val[:,1:]

In [204]:
# model = xgb.XGBClassifier(objective ='reg:softmax', gamma= 0.2, learning_rate= 0.001, n_estimators = 200)

# model.fit(X_res, y_res)

test = pd.read_csv('test.csv')

test1 = test.drop('indexId', axis=1)

pred_df = xg.predict(test1)

results_df = pd.DataFrame(data={'indexId':test['indexId'], 'preds':pred_df})
results_df['Predicted'] = le.inverse_transform(results_df['preds'])
results_df.to_csv('submission-random-grid-search-xgb.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'submission-random-grid-search-xgb.csv'