# Logistic Regression Demo - Large Dataset

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score
from sklearn.impute import SimpleImputer


%matplotlib inline
os.getcwd()

## Import Dataset

In [60]:
cosmetic=pd.read_csv('cosmeticdata.csv')
cosmetic=cosmetic.sample(10000)
cosmetic['ChemicalCount'].value_counts().unique()

array([7689, 1838,  280,  117,   66,    7,    2,    1], dtype=int64)

## Target variable is imbalanced - Apply SMOTE

In [None]:

sm = SMOTE(random_state=589)
X_SMOTE, y_SMOTE = sm.fit_sample(X_train, y_train)
print(len(y_SMOTE))

## Exploratory Data Analysis

In [None]:
cosmetic.info()

In [None]:
cosmetic.describe()

In [23]:
null_value_cols=cosmetic.isnull().any()
null_value_cols=cosmetic.columns[null_value_cols]
null_value_cols

Index(['CSFId', 'CSF', 'BrandName', 'CasNumber', 'DiscontinuedDate',
       'ChemicalDateRemoved'],
      dtype='object')

In [24]:
imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
#cosmetic[['CSFId','CSF']]=imputer.fit_transform(cosmetic[['CSFId','CSF']])
cosmetic[['CSFId', 'CSF', 'BrandName', 'CasNumber', 'DiscontinuedDate','ChemicalDateRemoved']]=imputer.fit_transform(cosmetic[['CSFId', 'CSF', 'BrandName', 'CasNumber', 'DiscontinuedDate','ChemicalDateRemoved']])

In [25]:

cosmetic.loc[:,null_value_cols]=imputer.fit_transform(cosmetic.loc[:,null_value_cols])
#cosmetic.isnull().sum()

In [None]:
sns.boxplot(x='sex',y='charges',hue='region',data=insurance)

In [None]:
sns.regplot(x='age',y='charges',data=insurance)
plt.show()

In [None]:
fig=sns.FacetGrid(insurance,col='sex')
fig.map(plt.hist,'charges')

In [None]:
insurance.isnull().sum()

In [None]:
insurance.plot('age',kind='hist')

In [None]:
insurance.plot('charges',kind='hist')

## Convert Categorical Columns to Numeric

### Get Dummies 

In [32]:
processed_data=pd.get_dummies(cosmetic)

### Label Encoder

In [None]:
le=LabelEncoder()
qualitative=[c for c in insurance.columns if insurance.dtypes[c]=='object']
le_sex=LabelEncoder()
le_smoker=LabelEncoder()
le_region=LabelEncoder()
insurance[qualitative[0]]=le_sex.fit_transform(insurance[qualitative[0]])
insurance[qualitative[1]]=le_sex.fit_transform(insurance[qualitative[1]])
insurance[qualitative[2]]=le_sex.fit_transform(insurance[qualitative[2]])
insurance.head()

In [34]:
processed_data.head()

Unnamed: 0,CDPHId,CSFId,CompanyId,PrimaryCategoryId,SubCategoryId,CasId,ChemicalId,ChemicalCount,"ProductName_""DIOR PRESTIGE LE MICRO-FLUIDE TEINT DE ROSE Micro-Nutritive Skincare Foundation Revitalizing and Illuminating","ProductName_""Hello Flawless!"" SPF 15",...,ChemicalDateRemoved_12/05/2103,ChemicalDateRemoved_12/08/2009,ChemicalDateRemoved_12/09/2011,ChemicalDateRemoved_12/14/2009,ChemicalDateRemoved_12/16/2009,ChemicalDateRemoved_12/28/2018,ChemicalDateRemoved_12/30/2013,ChemicalDateRemoved_12/31/2009,ChemicalDateRemoved_12/31/2016,ChemicalDateRemoved_12/31/2017
76040,27368,54453.0,784,44,58,656,44311,1,0,0,...,0,0,0,0,0,0,0,0,0,0
28816,8777,54453.0,396,59,65,656,13675,1,0,0,...,0,0,0,0,1,0,0,0,0,0
102998,37071,59531.0,23,59,65,656,60903,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12940,3070,5990.0,67,44,52,656,5125,1,0,0,...,0,0,0,0,0,0,0,0,0,0
64681,23314,54453.0,184,18,25,656,35549,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split the target and predictors to y and x variables

In [35]:
y=processed_data['ChemicalCount']
x=processed_data.drop('ChemicalCount',axis=1)

### Scaler  - Standard Scaler used here to scale the data

In [36]:
scaler=StandardScaler()
x=scaler.fit_transform(processed_data)

In [37]:
len(x),len(y)

(5000, 5000)

### Split the processed dataset into train and test datasets

In [38]:

xtrain,xtest,ytrain,ytest=train_test_split(x,y,random_state=1,test_size=0.30)

In [39]:
xtrain.shape,ytrain.shape,xtest.shape,ytest.shape

((3500, 12943), (3500,), (1500, 12943), (1500,))

## Instantiate Linear Regression Model

In [40]:
lm=LogisticRegression()

### Create the Linear Regression Model

In [41]:
model=lm.fit(xtrain,ytrain)




## Predict the model with test dataset

In [42]:
result=model.predict(xtest)

## Check Accuracy for the model

In [None]:
### Graph not working fine. 

cm=confusion_matrix(ytest,result)
print(accuracy_score(ytest,result))
fig,ax=plt.subplots(figsize=(10,5))
ax.imshow(cm)
ax.grid(False)
ax.set_xlabel('Predicted outputs', fontsize=2, color='black')
ax.set_ylabel('Actual outputs', fontsize=2, color='black')
ax.xaxis.set(ticks=range(10))
ax.yaxis.set(ticks=range(10))

In [None]:
df=pd.DataFrame({'Actual':ytest,'Predicted':result})



In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(ytest, result)
fpr, tpr, thresholds = roc_curve(ytest, model.predict_proba(xtest)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

## Without Scaler

In [None]:
test=processed_data['charges']
train=processed_data.drop('charges',axis=1)

In [None]:
xtrain1,xtest1,ytrain1,ytest1=train_test_split(train,test,test_size=0.3,random_state=3)

In [None]:
xtrain1.shape,ytrain1.shape

In [None]:
model1=lm.fit(xtrain1,ytrain1)

In [None]:
result1=model.predict(xtest1)

In [None]:
mse1=mean_squared_error(result1, ytest1)
rmse1 = np.sqrt(mse1)
print(rmse1)
print(r2_score(ytest1,result1))

In [None]:
df=pd.DataFrame({'Actual':ytest,'Predicted':result})

df1=pd.DataFrame({'Actual1':ytest1,'Predicted1':result1})
sns.lmplot(x='Actual',y='Predicted',data=df)
sns.lmplot(x='Actual1',y='Predicted1',data=df1)