In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [18]:
class Model:
    """Load a CSV and train/evaluate one of four scikit-learn classifiers.

    Expected call order: ``split`` -> ``select_model`` -> ``fit`` ->
    ``predict_and_metrics``. ``split`` and ``select_model`` are independent
    of each other, but both must run before ``fit``.
    """

    def __init__(self, data, random_state=None):
        """Read the dataset and build the candidate estimators.

        Parameters
        ----------
        data : str or file-like
            Anything ``pd.read_csv`` accepts.
        random_state : int or None, optional
            Seed forwarded to the decision tree and the random forest so
            their results can be reproduced. The default ``None`` keeps the
            previous (unseeded) behaviour.
        """
        self.df = pd.read_csv(data)
        self.m1 = LogisticRegression(max_iter=1000)
        self.m2 = DecisionTreeClassifier(criterion='gini', max_depth=4,
                                         min_samples_split=15,
                                         random_state=random_state)
        self.m3 = RandomForestClassifier(n_estimators=80, criterion='gini',
                                         max_depth=4, min_samples_split=15,
                                         random_state=random_state)
        self.m4 = KNeighborsClassifier(n_neighbors=7)

    def select_model(self, model):
        """Choose the active estimator by short name and print it.

        Parameters
        ----------
        model : {'LogReg', 'DT', 'RF', 'KNN'}

        Raises
        ------
        ValueError
            If ``model`` is not one of the known names. The previously
            selected estimator (if any) is left untouched in that case.
        """
        candidates = {
            'LogReg': self.m1,
            'DT': self.m2,
            'RF': self.m3,
            'KNN': self.m4,
        }
        # Validate before assigning anything: an unknown name used to fall
        # through every branch and either crash on print() or silently keep
        # a stale estimator from an earlier call.
        if model not in candidates:
            raise ValueError(
                'Unknown model %r; expected one of %s'
                % (model, sorted(candidates))
            )
        self.mod_name = model
        self.model = candidates[model]
        print(self.model)

    def split(self, ts, rs, tar_var):
        """Separate features from the target and make a train/test split.

        Parameters
        ----------
        ts : float or int
            ``test_size`` passed to ``train_test_split``.
        rs : int or None
            ``random_state`` passed to ``train_test_split``.
        tar_var : str
            Name of the target column in ``self.df``.
        """
        self.ts, self.rs = ts, rs
        self.tar_var = tar_var
        self.x = self.df.drop(self.tar_var, axis=1)
        self.y = self.df[self.tar_var]
        (self.x_train, self.x_test,
         self.y_train, self.y_test) = train_test_split(self.x, self.y,
                                                       test_size=self.ts,
                                                       random_state=self.rs)
        print('x_train shape', self.x_train.shape)
        print('x_test shape', self.x_test.shape)
        print('y_train shape', self.y_train.shape)
        print('y_test shape', self.y_test.shape)

    def fit(self):
        """Fit the selected estimator on the training split."""
        self.model.fit(self.x_train, self.y_train)

    def predict_and_metrics(self):
        """Predict on the test split and compute evaluation metrics.

        Returns
        -------
        tuple
            ``(ypred, train_score, test_score, accuracy, confusion_matrix,
            classification_report)``; every element is also stored on the
            instance.
        """
        self.ypred = self.model.predict(self.x_test)
        self.train_sc = self.model.score(self.x_train, self.y_train)
        self.test_sc = self.model.score(self.x_test, self.y_test)
        # For scikit-learn classifiers score() is mean accuracy, so this
        # normally equals test_sc; kept because callers unpack six values.
        self.acc_sc = accuracy_score(self.y_test, self.ypred)
        self.cm = confusion_matrix(self.y_test, self.ypred)
        self.creport = classification_report(self.y_test, self.ypred)
        return (self.ypred, self.train_sc, self.test_sc,
                self.acc_sc, self.cm, self.creport)

        
# Logistic regression on the iris data: 25% hold-out, 'label' is the target.
a1 = Model('iris.csv')
a1.split(ts=0.25, rs=32, tar_var='label')
a1.select_model('LogReg')
a1.fit()
ypred, train_score, test_score, acc_score, cm, creport = a1.predict_and_metrics()

# Print each result under the same heading text as before.
for heading, payload in (
    ('Predictions\n', ypred),
    ('Train_score', train_score),
    ('Test_score', test_score),
    ('Accuracy_Score', acc_score),
    ('Confusion_Matrix\n', cm),
    ('Classification_report\n', creport),
):
    print(heading, payload)

x_train shape (111, 4)
x_test shape (38, 4)
y_train shape (111,)
y_test shape (38,)
LogisticRegression(max_iter=1000)
Predictions
 ['Iris-versicolor' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica']
Train_score 0.990990990990991
Test_score 0.9473684210526315
Accuracy_Score 0.9473684210526315
Confusion_Matrix
 [[16  0  0]
 [ 0  9  2]
 [ 0  0 11]]
Classification_report
                  precision    recall  f1-score   support

    Iris-setosa      

In [19]:
# Decision tree on the iris data: 25% hold-out, 'label' is the target.
a2 = Model('iris.csv')
a2.split(ts=0.25, rs=32, tar_var='label')
a2.select_model('DT')
a2.fit()
ypred2, train_score2, test_score2, acc_score2, cm2, creport2 = a2.predict_and_metrics()

# Print each result under the same heading text as before.
for heading, payload in (
    ('Predictions\n', ypred2),
    ('Train_score', train_score2),
    ('Test_score', test_score2),
    ('Accuracy_Score', acc_score2),
    ('Confusion_Matrix\n', cm2),
    ('Classification_report\n', creport2),
):
    print(heading, payload)

x_train shape (111, 4)
x_test shape (38, 4)
y_train shape (111,)
y_test shape (38,)
DecisionTreeClassifier(max_depth=4, min_samples_split=15)
Predictions
 ['Iris-versicolor' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica']
Train_score 0.990990990990991
Test_score 0.9473684210526315
Accuracy_Score 0.9473684210526315
Confusion_Matrix
 [[16  0  0]
 [ 0  9  2]
 [ 0  0 11]]
Classification_report
                  precision    recall  f1-score   suppor

In [20]:
# Random forest on the iris data: 25% hold-out, 'label' is the target.
a3 = Model('iris.csv')
a3.split(ts=0.25, rs=32, tar_var='label')
a3.select_model('RF')
a3.fit()
ypred3, train_score3, test_score3, acc_score3, cm3, creport3 = a3.predict_and_metrics()

# Print each result under the same heading text as before.
for heading, payload in (
    ('Predictions\n', ypred3),
    ('Train_score', train_score3),
    ('Test_score', test_score3),
    ('Accuracy_Score', acc_score3),
    ('Confusion_Matrix\n', cm3),
    ('Classification_report\n', creport3),
):
    print(heading, payload)

x_train shape (111, 4)
x_test shape (38, 4)
y_train shape (111,)
y_test shape (38,)
RandomForestClassifier(max_depth=4, min_samples_split=15, n_estimators=80)
Predictions
 ['Iris-versicolor' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica']
Train_score 0.990990990990991
Test_score 0.9473684210526315
Accuracy_Score 0.9473684210526315
Confusion_Matrix
 [[16  0  0]
 [ 0  9  2]
 [ 0  0 11]]
Classification_report
                  precision    recall  

In [21]:
# k-nearest neighbours on the iris data: 25% hold-out, 'label' is the target.
a4 = Model('iris.csv')
a4.split(ts=0.25, rs=32, tar_var='label')
a4.select_model('KNN')
a4.fit()
ypred4, train_score4, test_score4, acc_score4, cm4, creport4 = a4.predict_and_metrics()

# Print each result under the same heading text as before.
for heading, payload in (
    ('Predictions\n', ypred4),
    ('Train_score', train_score4),
    ('Test_score', test_score4),
    ('Accuracy_Score', acc_score4),
    ('Confusion_Matrix\n', cm4),
    ('Classification_report\n', creport4),
):
    print(heading, payload)

x_train shape (111, 4)
x_test shape (38, 4)
y_train shape (111,)
y_test shape (38,)
KNeighborsClassifier(n_neighbors=7)
Predictions
 ['Iris-versicolor' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica']
Train_score 0.972972972972973
Test_score 0.9473684210526315
Accuracy_Score 0.9473684210526315
Confusion_Matrix
 [[16  0  0]
 [ 0  9  2]
 [ 0  0 11]]
Classification_report
                  precision    recall  f1-score   support

    Iris-setosa    

#### Modification

In [None]:
# Planned additions to the Model class:
# - handle null values
# - drop unneeded columns
# - label-encode categorical columns

In [37]:
class Model:
    """Load a CSV, preprocess it, and train/evaluate a scikit-learn classifier.

    This cell redefines the ``Model`` class from the earlier cell and adds
    preprocessing steps (``drop_cols``, ``handle_null``, ``cat_encode``).
    Expected call order: optional preprocessing -> ``split`` ->
    ``select_model`` -> ``fit`` -> ``predict_and_metrics``.
    """

    def __init__(self, data, random_state=None):
        """Read the dataset and build the candidate estimators.

        Parameters
        ----------
        data : str or file-like
            Anything ``pd.read_csv`` accepts.
        random_state : int or None, optional
            Seed forwarded to the decision tree and the random forest so
            their results can be reproduced. The default ``None`` keeps the
            previous (unseeded) behaviour.
        """
        self.df = pd.read_csv(data)
        self.m1 = LogisticRegression(max_iter=1000)
        self.m2 = DecisionTreeClassifier(criterion='gini', max_depth=4,
                                         min_samples_split=15,
                                         random_state=random_state)
        self.m3 = RandomForestClassifier(n_estimators=80, criterion='gini',
                                         max_depth=4, min_samples_split=15,
                                         random_state=random_state)
        self.m4 = KNeighborsClassifier(n_neighbors=7)

    @staticmethod
    def _is_text_dtype(dtype):
        """Return True for string-holding dtypes (object or pandas StringDtype)."""
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def select_model(self, model):
        """Choose the active estimator by short name and print it.

        Parameters
        ----------
        model : {'LogReg', 'DT', 'RF', 'KNN'}

        Raises
        ------
        ValueError
            If ``model`` is not one of the known names. The previously
            selected estimator (if any) is left untouched in that case.
        """
        candidates = {
            'LogReg': self.m1,
            'DT': self.m2,
            'RF': self.m3,
            'KNN': self.m4,
        }
        # Validate before assigning anything: an unknown name used to fall
        # through every branch and either crash on print() or silently keep
        # a stale estimator from an earlier call.
        if model not in candidates:
            raise ValueError(
                'Unknown model %r; expected one of %s'
                % (model, sorted(candidates))
            )
        self.mod_name = model
        self.model = candidates[model]
        print(self.model)

    def handle_null(self):
        """Fill missing values column by column.

        Text columns (object / string dtype) get their mode (the largest
        value when several modes tie); every other column with missing
        values gets its mean. The affected column names are stored in
        ``self.cat_nan`` and ``self.num_nan``.
        """
        with_nulls = [c for c in self.df.columns if self.df[c].isnull().sum() > 0]
        self.cat_nan = [c for c in with_nulls
                        if self._is_text_dtype(self.df[c].dtype)]
        self.num_nan = [c for c in with_nulls
                        if not self._is_text_dtype(self.df[c].dtype)]
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on self.df[col]: that chained in-place
        # form acts on a temporary and does not update the frame under
        # pandas Copy-on-Write.
        for col in self.cat_nan:
            most_common = self.df[col].mode().max()
            self.df[col] = self.df[col].fillna(most_common)
        for col in self.num_nan:
            average = self.df[col].mean()
            self.df[col] = self.df[col].fillna(average)

    def drop_cols(self, *col_list):
        """Drop the given column names from ``self.df`` in place.

        Raises ``KeyError`` (from pandas) if a name is not a column.
        """
        self.clist = col_list
        self.df.drop(list(self.clist), axis=1, inplace=True)

    def cat_encode(self):
        """Label-encode every text column of ``self.df``.

        A separate ``LabelEncoder`` is fitted per column and kept in
        ``self.encoders`` (column name -> encoder) so codes can be mapped
        back later. ``self.lb`` still refers to the encoder of the last
        encoded column, as before.
        """
        self.cat_cols = [c for c in self.df.columns
                         if self._is_text_dtype(self.df[c].dtype)]
        print('Cat_cols', self.cat_cols)
        self.lb = LabelEncoder()
        self.encoders = {}
        for col in self.cat_cols:
            encoder = LabelEncoder()
            self.df[col] = encoder.fit_transform(self.df[col])
            self.encoders[col] = encoder
            self.lb = encoder

    def split(self, ts, rs, tar_var):
        """Separate features from the target and make a train/test split.

        Parameters
        ----------
        ts : float or int
            ``test_size`` passed to ``train_test_split``.
        rs : int or None
            ``random_state`` passed to ``train_test_split``.
        tar_var : str
            Name of the target column in ``self.df``.
        """
        self.ts, self.rs = ts, rs
        self.tar_var = tar_var
        self.x = self.df.drop(self.tar_var, axis=1)
        self.y = self.df[self.tar_var]
        (self.x_train, self.x_test,
         self.y_train, self.y_test) = train_test_split(self.x, self.y,
                                                       test_size=self.ts,
                                                       random_state=self.rs)
        print('x_train shape', self.x_train.shape)
        print('x_test shape', self.x_test.shape)
        print('y_train shape', self.y_train.shape)
        print('y_test shape', self.y_test.shape)

    def fit(self):
        """Fit the selected estimator on the training split."""
        self.model.fit(self.x_train, self.y_train)

    def predict_and_metrics(self):
        """Predict on the test split and compute evaluation metrics.

        Returns
        -------
        tuple
            ``(ypred, train_score, test_score, accuracy, confusion_matrix,
            classification_report)``; every element is also stored on the
            instance.
        """
        self.ypred = self.model.predict(self.x_test)
        self.train_sc = self.model.score(self.x_train, self.y_train)
        self.test_sc = self.model.score(self.x_test, self.y_test)
        # For scikit-learn classifiers score() is mean accuracy, so this
        # normally equals test_sc; kept because callers unpack six values.
        self.acc_sc = accuracy_score(self.y_test, self.ypred)
        self.cm = confusion_matrix(self.y_test, self.ypred)
        self.creport = classification_report(self.y_test, self.ypred)
        return (self.ypred, self.train_sc, self.test_sc,
                self.acc_sc, self.cm, self.creport)

        
# Logistic regression on the Titanic training data: drop the listed columns,
# fill nulls, label-encode text columns, then predict 'Survived' on a 25% hold-out.
a1 = Model('titanic_train.csv')
a1.drop_cols('PassengerId', 'Name', 'Ticket', 'Cabin')
a1.handle_null()
a1.cat_encode()
a1.split(ts=0.25, rs=32, tar_var='Survived')
a1.select_model('LogReg')
a1.fit()
ypred, train_score, test_score, acc_score, cm, creport = a1.predict_and_metrics()

# Print each result under the same heading text as before.
for heading, payload in (
    ('Predictions\n', ypred),
    ('Train_score', train_score),
    ('Test_score', test_score),
    ('Accuracy_Score', acc_score),
    ('Confusion_Matrix\n', cm),
    ('Classification_report\n', creport),
):
    print(heading, payload)

Cat_cols ['Sex', 'Embarked']
x_train shape (668, 7)
x_test shape (223, 7)
y_train shape (668,)
y_test shape (223,)
LogisticRegression(max_iter=1000)
Predictions
 [0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 1
 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0
 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 1
 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 0
 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 1 0 0
 1]
Train_score 0.8158682634730539
Test_score 0.7757847533632287
Accuracy_Score 0.7757847533632287
Confusion_Matrix
 [[113  20]
 [ 30  60]]
Classification_report
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       133
           1       0.75      0.67      0.71        90

    accuracy                           0.78       223
   macr

In [32]:
# Small three-column frame used to demonstrate dropping columns via *args.
df = pd.DataFrame(
    {
        'x1': list(range(1, 6)),
        'x2': list(range(10, 15)),
        'x3': [5, 6, 7, 89, 9],
    }
)
# df.head()

def dropc(*w):
    """Drop the named columns from the notebook-level ``df`` in place.

    The positional arguments are collected into ``w``; its type is printed
    first, then the head of the frame after the drop.
    """
    print(type(w))
    labels = list(w)
    df.drop(columns=labels, inplace=True)
    print(df.head())
    
# Both labels arrive in dropc as the tuple `w`, so one call removes x1 and x2.
# Re-running this cell without rebuilding `df` raises KeyError: the columns are already gone.
dropc('x1','x2')

<class 'tuple'>
   x3
0   5
1   6
2   7
3  89
4   9
