In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Load the raw dataset; the path is relative to the current working directory.
dataset_path = "Dataset_preprocessing.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Display the freshly loaded DataFrame.
df

Unnamed: 0,id,Country,Age,Salary,Purchased,useless_col,almost_empty
0,0,France,44.0,72000,No,useless,
1,1,Spain,27.0,48000,Yes,useless,40.0
2,2,Germany,30.0,54000,No,useless,
3,3,Spain,38.0,61000,No,useless,20.0
4,4,Germany,40.0,69000,Yes,useless,
5,5,France,35.0,58000,Yes,useless,
6,6,Spain,,52000,No,useless,
7,7,France,48.0,79000,Yes,useless,
8,8,Germany,50.0,83000,No,useless,
9,9,France,37.0,67000,Yes,useless,


In [4]:
# Number of rows in the dataset.
len(df)

12

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,id,Country,Age,Salary,Purchased,useless_col,almost_empty
count,12.0,12,11.0,12.0,12,12,2.0
unique,,3,,,2,1,
top,,France,,,Yes,useless,
freq,,5,,,7,12,
mean,5.5,,36.909091,83389580.0,,,30.0
std,3.605551,,19.002392,288657400.0,,,14.142136
min,0.0,,-10.0,32000.0,,,20.0
25%,2.75,,32.5,53500.0,,,25.0
50%,5.5,,38.0,64000.0,,,30.0
75%,8.25,,46.0,73750.0,,,35.0


In [6]:
# Percentage of missing values in each column.
n_rows = df.shape[0]
missing_per_column = df.isnull().sum()
display(100 * missing_per_column / n_rows)

id               0.000000
Country          0.000000
Age              8.333333
Salary           0.000000
Purchased        0.000000
useless_col      0.000000
almost_empty    83.333333
dtype: float64

In [7]:
# Columns that will be removed from the dataset before modelling.
useless_cols = [
    'id',
    'useless_col',
    'almost_empty',
]

In [10]:
# Remove the columns listed in `useless_cols` from the dataset.
# `df` is rebound by this cell, so running it a second time would raise a
# KeyError because the columns are already gone; `errors='ignore'` makes the
# cell idempotent while leaving the first run unchanged.
df = df.drop(columns=useless_cols, errors='ignore')

In [11]:
# Inspect the DataFrame once the unwanted columns have been dropped.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000,No
1,Spain,27.0,48000,Yes
2,Germany,30.0,54000,No
3,Spain,38.0,61000,No
4,Germany,40.0,69000,Yes
5,France,35.0,58000,Yes
6,Spain,,52000,No
7,France,48.0,79000,Yes
8,Germany,50.0,83000,No
9,France,37.0,67000,Yes


In [15]:
# Discard rows with a non-positive Age; rows where Age is missing are kept
# (the numeric pipeline imputes them later).
age = df['Age']
to_keep = age.isnull() | (age > 0)
df = df[to_keep]

In [16]:
# Drop extreme Salary outliers: keep only rows whose salary lies below
# mean + 2 * std (upper bound only).
# Rows with a missing Salary are kept too, mirroring the Age filter: a
# comparison with NaN evaluates to False and would silently discard them,
# even though the numeric pipeline imputes missing values later on.
salary = df['Salary']
upper_bound = salary.mean() + 2*salary.std()
to_keep = (salary < upper_bound) | salary.isnull()
df = df.loc[to_keep,:]

In [17]:
# Inspect the DataFrame after the Age and Salary filters.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000,No
1,Spain,27.0,48000,Yes
2,Germany,30.0,54000,No
3,Spain,38.0,61000,No
4,Germany,40.0,69000,Yes
5,France,35.0,58000,Yes
6,Spain,,52000,No
7,France,48.0,79000,Yes
8,Germany,50.0,83000,No
9,France,37.0,67000,Yes


In [21]:
# Separate the label column (y) from the explanatory columns (x).
target_name = 'Purchased'
x = df.drop(columns=[target_name])
y = df[target_name]

In [22]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Numeric columns of x_train / x_test and the preprocessing chain applied to them.
numeric_features = ['Age', 'Salary']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values with the column median
    ('scaler', StandardScaler()),                   # standardise the imputed values
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [24]:
# Categorical columns of x_train / x_test and the preprocessing chain applied to them.
categorical_features = ['Country']
categorical_steps = [
    # fill missing values with the most frequent category
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # one-hot encode, dropping the first category to avoid redundant (collinear) columns
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [25]:
# Route each group of columns to its dedicated preprocessing pipeline.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [26]:
# Fit the preprocessing on the training rows only, then apply the already
# fitted transformers to the test rows.
# NOTE(review): x_train / x_test are rebound to the transformed output, so
# re-running this cell alone feeds that output back into the preprocessor;
# presumably the train/test split cell must be re-run first — confirm.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [28]:
# Preview the first rows of the transformed training and test matrices.
print(x_train[:5])
print("#######")
print(x_test[:5, :])

[[ 0.27978024  0.58858382  1.          0.        ]
 [-0.23673712  0.38385901  0.          0.        ]
 [-1.95846165 -1.56102665  0.          1.        ]
 [-0.06456467 -1.15157703  0.          1.        ]
 [ 1.65715986  1.61220785  0.          0.        ]]
#######
[[-1.44194429 -0.94685223  1.          0.        ]
 [ 2.00150476  2.02165746  1.          0.        ]]


In [29]:
# Encode the target labels as integers: the encoder is fitted on the training
# labels, then the same mapping is applied to the test labels.
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test = labelencoder.transform(y_test)

In [30]:
# Train a logistic regression classifier with default settings on the
# preprocessed training data.
model = LogisticRegression()
model.fit(x_train, y_train)

In [31]:
# Predictions of the fitted model on the training set.
y_train_pred = model.predict(x_train)

In [32]:
# Predictions of the fitted model on the held-out test set.
y_test_pred = model.predict(x_test)

In [33]:
# Accuracy on both splits, computed from the stored predictions.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy on training set : ", train_accuracy)
print("Accuracy on test set : ", test_accuracy)

Accuracy on training set :  0.75
Accuracy on test set :  0.0


In [34]:
# Same accuracies, obtained through the estimator's own score() method.
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print("Accuracy on training set : ", train_score)
print("Accuracy on test set : ", test_score)

Accuracy on training set :  0.75
Accuracy on test set :  0.0
