In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


microsoft_data=pd.read_csv('Microsoft_Data.csv')

y=microsoft_data['HasDetections']

microsoft_data.drop(columns='HasDetections',inplace=True)

missing_value_percentage_df=pd.DataFrame({'Column_Name':microsoft_data.columns,
                                          'Missing_value_count':microsoft_data.isnull().sum(),
                                          'Missing_value_percentage':microsoft_data.isnull().sum()/microsoft_data.shape[0]*100})

missing_value_percentage_df=missing_value_percentage_df.sort_values(by='Missing_value_percentage',ascending=False)

#missing_value_percentage_df


In [2]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

le.fit_transform(microsoft_data['Processor'])

array([1, 1, 1, ..., 1, 2, 1], dtype=int64)

In [2]:
cols_to_drop=[]

cols_with_na_grt_50=list(missing_value_percentage_df[missing_value_percentage_df['Missing_value_percentage']>50].index)

for col in microsoft_data.columns:
    if microsoft_data[col].nunique()==microsoft_data.shape[0] or microsoft_data[col].nunique()==1:
        cols_to_drop.append(col)
        

columns_to_drop=cols_to_drop+cols_with_na_grt_50

microsoft_data.drop(columns=columns_to_drop,inplace=True)


In [3]:
for col in microsoft_data.columns:
    if col!='Census_InternalPrimaryDiagonalDisplaySizeInInches':
        microsoft_data[col]=microsoft_data[col].astype(object)
        
microsoft_data['Census_InternalPrimaryDiagonalDisplaySizeInInches'].dtype


dtype('float64')

In [4]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(microsoft_data,y,test_size=0.2,random_state=43)

# Filling missing values

for col in X_train.columns:
    if X_train[col].dtype=='object':
        X_train[col]=X_train[col].fillna(X_train[col].mode()[0]).astype(object)
        X_test[col]=X_test[col].fillna(X_train[col].mode()[0]).astype(object)
        
    else:
        X_train[col]=X_train[col].fillna(X_train[col].mean())
        X_test[col]=X_test[col].fillna(X_train[col].mean())

  

In [34]:
cat_cols= [col for col in X_train.columns if X_train[col].dtype=='object']

one_hot_train=pd.get_dummies(X_train[cat_cols])

#one_hot_train

In [15]:
one_hot_train.shape

(45404, 63004)

In [16]:
one_hot_test=pd.get_dummies(X_test[cat_cols])

In [17]:
one_hot_test.shape

(11352, 25672)

In [30]:
X_train_final,X_test_final=one_hot_train.align(one_hot_test,join='inner',axis=1,fill_value=0)



In [31]:
X_train_final['Census_InternalPrimaryDiagonalDisplaySizeInInches']=X_train['Census_InternalPrimaryDiagonalDisplaySizeInInches']
X_test_final['Census_InternalPrimaryDiagonalDisplaySizeInInches']=X_test['Census_InternalPrimaryDiagonalDisplaySizeInInches']

In [34]:
X_train_final.shape
X_test_final.shape

(11352, 15952)

In [35]:
from sklearn.linear_model import LogisticRegression

logreg=LogisticRegression()
logreg.fit(X_train_final,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
logreg_pred=logreg.predict(X_test_final)

In [38]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score
confusion_matrix(y_test,logreg_pred)

precision_score(y_test,logreg_pred)
recall_score(y_test,logreg_pred)
f1_score(y_test,logreg_pred)

0.6062729980725425

# Examples to demonstrate one hot encoding

In [29]:


train=['Hyd','Chennai','Hyd','Bangalore','Bangalore','Pune','Chennai','Hyd','Pune','Amsterdam']

test=['Hyd','Chennai','Hyd','Ahmedabad','Pune','Chennai','Hyd','Bangalore']

one_hot_train=pd.get_dummies(train)


In [30]:
one_hot_test=pd.get_dummies(test)

In [32]:
train_final,test_final=one_hot_train.align(one_hot_test,join='inner',fill_value=0)

train_final

Unnamed: 0,Ahmedabad,Amsterdam,Bangalore,Chennai,Hyd,Pune
0,0,0,0,0,1,0
1,0,0,0,1,0,0
2,0,0,0,0,1,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0
5,0,0,0,0,0,1
6,0,0,0,1,0,0
7,0,0,0,0,1,0
8,0,0,0,0,0,1
9,0,1,0,0,0,0


In [33]:
test_final

Unnamed: 0,Ahmedabad,Amsterdam,Bangalore,Chennai,Hyd,Pune
0,0,0,0,0,1,0
1,0,0,0,1,0,0
2,0,0,0,0,1,0
3,1,0,0,0,0,0
4,0,0,0,0,0,1
5,0,0,0,1,0,0
6,0,0,0,0,1,0
7,0,0,1,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [26]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [27]:
le=LabelEncoderExt()

for col in cat_cols:
    le.fit(X_train[col])
    X_train[col]=le.transform(X_train[col])
    X_test[col]=le.transform(X_test[col])