In [95]:
import pandas as pd
import numpy as np

df = pd.read_csv('C:/Users/Python notebooks/abalone.csv')

In [96]:
# total number of rows
len(df.iloc[:,0])

4177

In [97]:
df.dtypes

sex                object
Gender             object
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight     object
viscera_weight    float64
shell_weight      float64
rings               int64
class               int64
dtype: object

In [98]:
df.nunique()

sex                  4
Gender               3
length             134
diameter           111
height              51
whole_weight      2428
shucked_weight    1515
viscera_weight     880
shell_weight       926
rings               28
class                2
dtype: int64

In [99]:
#Separating numerical & str features
df_str = df.select_dtypes(include=['object'])
df_num = df._get_numeric_data()

In [100]:
# Creating categorical features dataframe
#df_cat.replace(r'\s+', np.nan, regex=True)
df_str = df_str.apply(lambda x: x.str.strip())
df_str = df_str.replace('', np.nan, regex = True)
#df_cat = df_str.drop(['shucked_weight'], axis=1)

In [101]:
# Creating continuous features dataframe
df['shucked_weight'] = df['shucked_weight'].apply(pd.to_numeric, args=('coerce',))
#df_con['shucked_weight'] = pd.to_numeric(df['shucked_weight'])
df_con = pd.concat([df_num, df['shucked_weight'] ], axis=1, sort=False)

In [102]:
# Outlier treatment of continuous features
for col in df_con.columns.values:  
    Q1 = np.array(df_con[col].quantile([0.25]))[0]
    Q3 = np.array(df_con[col].quantile([0.75]))[0]
    IQR = Q3 - Q1
    df_con.loc[df_con[col] < Q1 - 1.5*IQR, col] = Q1 - 1.5*IQR
    df_con.loc[df_con[col] > Q3 + 1.5*IQR, col] = Q3 + 1.5*IQR

In [103]:
# Missing value imputation for categorical values
# Replace with mode if very less missing else create new category
df_cat.isna().sum()
df_cat['sex'] = df_cat['sex'].replace(np.nan, np.array(df_cat['sex'].mode())[0] , regex = True)
df_cat['Gender'] = df_cat['Gender'].replace(np.nan, 'No_value', regex = True)

In [104]:
# Missing value imputation for continuous values
df_con = df_con.fillna(df_con.mean())

In [106]:
df_con.isna().sum()

length            0
diameter          0
height            0
whole_weight      0
viscera_weight    0
shell_weight      0
rings             0
class             0
shucked_weight    0
dtype: int64

In [107]:
# Combining data
data = pd.concat([df_cat, df_con], axis = 1, sort = False)

In [109]:
data_x = data.drop(['class'], axis = 1)
data_y = data['class']

In [153]:
# train & test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0, stratify= data_y)

In [157]:
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
 
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train_ohe = ohe.fit_transform(X_train[df_cat.columns.values])
X_test_ohe = ohe.transform(X_test[df_cat.columns.values])
X_train_ohe.head()

Unnamed: 0,sex_I,sex_M,sex_F,Gender_Infant,Gender_Male,Gender_Female,Gender_No_value
2169,1,0,0,1,0,0,0
3295,0,1,0,0,1,0,0
2975,1,0,0,1,0,0,0
963,0,1,0,0,1,0,0
2084,0,0,1,0,0,1,0


In [158]:
X_train_con = X_train[df_con.drop(['class'], axis = 1).columns.values]
X_trainf = pd.concat([X_train_ohe, X_train_con], axis = 1, sort = False)

X_test_con = X_test[df_con.drop(['class'], axis = 1).columns.values]
X_testf = pd.concat([X_test_ohe, X_test_con], axis = 1, sort = False)

In [159]:
parameter = [{'max_depth': [1,10, 100, 200], 'min_samples_split': [2,5,10], 'min_samples_leaf': [1,5,10]}]
parameter

[{'max_depth': [1, 10, 100, 200],
  'min_samples_split': [2, 5, 10],
  'min_samples_leaf': [1, 5, 10]}]

In [160]:
######################## Random Forest ################################
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
parameter = [{'max_depth': [1,10, 100, 200], 'min_samples_split': [2,5,10], 'min_samples_leaf': [1,5,10]}]
model = GridSearchCV(RandomForestClassifier(class_weight= 'balanced'), parameter, scoring = 'precision', cv=5)
model.fit(X_trainf, Y_train)
print(model.best_estimator_)
print(model.score(X_testf, Y_test))

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=1, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.6027667984189723


In [161]:
rf = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=1, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
rf.fit(X_trainf, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=1, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [164]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_trainf.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
length,0.2
whole_weight,0.2
rings,0.2
shucked_weight,0.2
viscera_weight,0.1
shell_weight,0.1
sex_I,0.0
sex_M,0.0
sex_F,0.0
Gender_Infant,0.0


In [165]:
import xgboost as xgb

In [166]:
######################## XGBOOST ################################
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# parameter = [{'max_depth': [1,10, 100, 200], 'min_samples_split': [2,5,10], 'min_samples_leaf': [1,5,10]}]
# model = GridSearchCV(RandomForestClassifier(class_weight= 'balanced'), parameter, scoring = 'precision', cv=5)
# model.fit(X_trainf, Y_train)
# print(model.best_estimator_)
# print(model.score(X_testf, Y_test))

In [169]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_trainf, Y_train)
model.score(X_testf, Y_test)

  if diff:


0.5681818181818182

In [168]:
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X_trainf.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
shucked_weight,0.213946
whole_weight,0.172742
viscera_weight,0.152139
shell_weight,0.110935
diameter,0.098257
height,0.077655
length,0.069731
rings,0.066561
Gender_Infant,0.023772
sex_M,0.006339
