In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the CSV (path is relative to the working directory) into a DataFrame
# and show it as the cell's output.
dataset = pd.read_csv(
    "dataset.csv",
)
dataset

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


### Numero istanze

In [None]:
# Number of rows (instances) in the loaded frame.
len(dataset)

2000

### Valori nulli per colonna

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

Unnamed: 0,0
battery_power,0
blue,0
clock_speed,0
dual_sim,0
fc,0
four_g,0
int_memory,0
m_dep,0
mobile_wt,0
n_cores,0


### Bilanciamento

In [None]:
# Count the rows in each price class to check whether the target is balanced.
target = dataset["price_range"]
target.value_counts()

Unnamed: 0_level_0,count
price_range,Unnamed: 1_level_1
1,500
2,500
3,500
0,500


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# List the dtype of every column of the loaded frame.
dataset.dtypes

Unnamed: 0,0
battery_power,int64
blue,int64
clock_speed,float64
dual_sim,int64
fc,int64
four_g,int64
int_memory,int64
m_dep,float64
mobile_wt,int64
n_cores,int64


In [None]:
# Show the column labels of the loaded frame.
dataset.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [None]:
# Separate the target column from the features.
X = dataset.drop("price_range", axis=1)
Y = dataset["price_range"]

# Hold out a quarter of the rows as test set, stratified on the target and
# seeded so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=123,
)

In [None]:
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

In [None]:
# Shape of the training features as (rows, columns).
X_train.shape

(1500, 20)

In [None]:
# Sanity check: standardising a single column while passing the remaining
# ones through must leave the number of columns unchanged.
# The step is labelled "std" to match the transformer it wraps (a
# StandardScaler), consistent with the labels used by the next `coltran`.
coltran = ColumnTransformer(
    transformers=[("std", StandardScaler(), ["mobile_wt"])],
    remainder="passthrough",
)

coltran.fit_transform(X_train).shape

(1500, 20)

## Punto 2.5


In [None]:
# Preprocessing: standardise three columns, discretise two others into
# 5 bins, and pass every remaining column through untouched.
coltran = ColumnTransformer(
    transformers=[
        ("std", StandardScaler(), ["int_memory", "ram", "talk_time"]),
        ("discr", KBinsDiscretizer(n_bins=5), ["mobile_wt", "battery_power"]),
    ],
    remainder="passthrough",
)

# Chain the preprocessing with a seeded decision tree.
pipeline = Pipeline(
    steps=[
        ("coltran", coltran),
        ("estimator", DecisionTreeClassifier(random_state=123)),
    ]
)

# Fit on the training split, then score the predictions on the held-out split.
pipeline.fit(X_train, Y_train)
Y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: " + str(test_accuracy))
confusion_matrix(Y_test, Y_pred)

Accuracy: 0.788


array([[108,  17,   0,   0],
       [ 15,  93,  17,   0],
       [  0,  22,  88,  15],
       [  0,   0,  20, 105]])

In [None]:
# Display the fitted pipeline object as the cell's output.
pipeline

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Punto 2.6

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid, keyed as <pipeline step>__<parameter>:
#   - number of bins of the "discr" KBinsDiscretizer inside "coltran"
#   - number of features kept by SelectKBest ("all" keeps every feature)
#   - split criterion and minimum leaf size of the decision tree
parameters = {
    'coltran__discr__n_bins': [10, 20, 30, 40, 50],
    'selectkbest__k': [1, 5, 10, "all"],
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__min_samples_leaf': list(range(1, 11)),
}

# Same steps as the Punto 2.5 pipeline, with a feature-selection step
# inserted between the preprocessing and the classifier.
extended_pipeline = Pipeline(
    steps=[
        ("coltran", coltran),
        ("selectkbest", SelectKBest()),
        ("estimator", DecisionTreeClassifier(random_state=123)),
    ]
)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring.
gd = GridSearchCV(extended_pipeline, parameters)
gd.fit(X_train, Y_train)

# Score the best estimator on the held-out split, so the test predictions are
# actually evaluated and can be compared with the other points.
Y_pred = gd.predict(X_test)
print("Accuracy: " + str(accuracy_score(Y_test, Y_pred)))
gd.best_params_

  _data = np.array(data, dtype=dtype, copy=copy,


{'coltran__discr__n_bins': 10,
 'estimator__criterion': 'gini',
 'estimator__min_samples_leaf': 1,
 'selectkbest__k': 10}

In [None]:
# Mean cross-validated score of the best parameter combination found by the
# search (estimator's default scorer, since no `scoring` was passed).
gd.best_score_

0.8219999999999998

In [None]:
# Display the fitted grid-search object as the cell's output.
gd

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Punto 2.7


In [None]:
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# Reuse the tree hyper-parameters selected by the previous grid search.
# This must run before `gd` is rebound by the next search, whose
# best_params_ does not contain these keys.
selected_params = gd.best_params_
best_criterion = selected_params['estimator__criterion']
best_min_samples_leaf = selected_params['estimator__min_samples_leaf']

In [None]:
def identity_func(X):
    """Return ``X`` unchanged.

    Used as the function of a FunctionTransformer so that the input
    features are forwarded as they are.
    """
    return X

# Only the number of SVD components is tuned here; the tree hyper-parameters
# are fixed to best_criterion / best_min_samples_leaf.
parameters = {
    'combined_features__svd__n_components': [2, 4, 6]
}

# Concatenate the SVD projection with the untouched input features.
# TruncatedSVD uses a randomized solver by default, so it is seeded to keep
# the results reproducible.
combined_features = FeatureUnion([
    ("svd", TruncatedSVD(random_state=123)),
    ('passthrough', FunctionTransformer(identity_func, validate=False)),
])

# The tree is seeded like the other classifiers in this notebook, so that
# repeated runs give the same model. Per-step logging is left off: during a
# grid search it prints three lines for every single fit.
my_pipeline = Pipeline(steps=[
    ("coltran", coltran),
    ('combined_features', combined_features),
    ("estimator", DecisionTreeClassifier(criterion=best_criterion,
                                         min_samples_leaf=best_min_samples_leaf,
                                         random_state=123)),
])

# NOTE: this rebinds `gd`; the cell that reads the tree parameters from the
# previous search cannot be re-run after this one.
gd = GridSearchCV(my_pipeline, parameters)
gd.fit(X_train, Y_train)

# Evaluate the best estimator on the held-out split. The confusion matrix is
# printed explicitly because only the last expression of a cell is displayed.
Y_pred = gd.predict(X_test)
print("Accuracy: " + str(accuracy_score(Y_test, Y_pred)))
print(confusion_matrix(Y_test, Y_pred))
gd.best_params_

[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.1s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.1s
[Pipel

{'combined_features__svd__n_components': 2}

In [None]:
# Display the fitted grid-search object as the cell's output.
gd

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).

