In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import f_classif, SelectFromModel, SelectKBest, VarianceThreshold, chi2
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('mobile_price_range_data.csv')

display(df.iloc[::, -1].value_counts())

X = df.iloc[::, :-1]
Y = df.iloc[::, -1]

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [17]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

display(f'x_test - shape:  {x_test.shape} y_test: {y_test.shape}')
display(f'x_train - shape:  {x_train.shape} y_train: {y_train.shape}')

'x_test - shape:  (600, 20) y_test: (600,)'

'x_train - shape:  (1400, 20) y_train: (1400,)'

In [19]:
import warnings 
warnings.filterwarnings('ignore')

model = LogisticRegression()
model.fit(x_train, y_train)

In [20]:
print(model.score(x_test, y_test))

0.6283333333333333


------------------------

In [21]:
#### Scaling 

MI_MA = MinMaxScaler()
X_s = MI_MA.fit_transform(X)

print(X_s[0].mean())
print(X_s[0].max())
print(X_s[0].min())
print(X_s.shape)

0.3076199247106047
1.0
0.0
(2000, 20)


In [22]:
x_train, x_test, y_train, y_test = train_test_split(X_s, Y, test_size=0.3, random_state=1)

In [23]:
model = LogisticRegression()
model.fit(x_train, y_train)

In [24]:
print(model.score(x_test, y_test))

0.8883333333333333


------

In [25]:
#### Standardize

ST = StandardScaler()
X_St = ST.fit_transform(X)

print(X_St.mean())
print(X_St.max())
print(X_St.min())
print(X_St.shape)

1.84297022087776e-17
3.3846284452337403
-1.7868609675630667
(2000, 20)


In [26]:
x_train, x_test, y_train, y_test = train_test_split(X_St, Y, test_size=0.3, random_state=1) 

In [27]:
model_st = LogisticRegression()
model_st.fit(x_train, y_train)

In [28]:
print(model_st.score(x_test, y_test))

0.935


---------------

In [29]:
### SelectKbest

model_Kbest = SelectKBest(score_func=f_classif, k=10)
model_Kbest.fit(X, Y)

In [30]:
X_k_best = model_Kbest.fit_transform(X, Y)

ind = model_Kbest.get_support(indices=True)

DF_Kbest = pd.DataFrame({'feature': X.columns, 'score': model_Kbest.scores_})
DF_Kbest.nlargest(10, 'score')

Unnamed: 0,feature,score
13,ram,3520.110824
0,battery_power,31.598158
12,px_width,22.620882
11,px_height,19.484842
8,mobile_wt,3.594318
6,int_memory,2.922996
9,n_cores,2.625415
14,sc_h,2.225984
15,sc_w,1.671
16,talk_time,1.628811


In [31]:
X_k_best.shape

(2000, 10)

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X_k_best, Y, test_size=0.3, random_state=1) 

In [33]:
model_K = LogisticRegression()
model_K.fit(x_train, y_train)

In [34]:
print(model_K.score(X_k_best, Y))

0.636


-------------

In [35]:
model_dec = DecisionTreeClassifier(max_depth=4)
model_dec.fit(x_train, y_train)

In [36]:
model_dec.score(x_test, y_test)

0.8266666666666667

---------------------------------

In [37]:
model_Kbest_2 = SelectKBest(score_func=f_classif, k=10)
model_Kbest_2.fit(X_St, Y)

In [38]:
X_ST_KB = model_Kbest_2.fit_transform(X_St, Y)

In [39]:
x_train, x_test, y_train, y_test = train_test_split(X_ST_KB, Y, test_size=0.3, random_state=1) 

In [40]:
model_Xstkb = LogisticRegression()
model_Xstkb.fit(x_train, y_train)

In [41]:
print(model_Xstkb.score(x_test, y_test))

0.96


In [42]:
X_ST_KB.shape

(2000, 10)

------------

In [43]:
##### Select from model

In [44]:
selcector = SelectFromModel(estimator=DecisionTreeClassifier()).fit(X, Y)

selcector.estimator_.feature_importances_

array([0.10094271, 0.        , 0.00398828, 0.003     , 0.00191111,
       0.00066667, 0.00716329, 0.00635287, 0.02061092, 0.0043873 ,
       0.00544553, 0.08902902, 0.09026927, 0.64794469, 0.00410982,
       0.00708446, 0.00620515, 0.        , 0.00088889, 0.        ])

In [45]:
X_select = selcector.fit_transform(X, Y)

X_select.shape

(2000, 4)

In [46]:
model_from_select = LogisticRegression()
model_from_select.fit(X_select, Y)

In [47]:
print(model_from_select.score(X_select, Y))

0.9595


---------------

In [49]:
#### Variance Threshold

v = np.var(X)

mean_v = v.values.mean()

In [50]:
model_variance = VarianceThreshold(threshold=mean_v)
X_vThreshodl = model_variance.fit_transform(X)

In [51]:
X_vThreshodl.shape

(2000, 4)

In [52]:
X_vThreshodl

array([[ 842.,   20.,  756., 2549.],
       [1021.,  905., 1988., 2631.],
       [ 563., 1263., 1716., 2603.],
       ...,
       [1911.,  868., 1632., 3057.],
       [1512.,  336.,  670.,  869.],
       [ 510.,  483.,  754., 3919.]])

In [53]:
model_log = LogisticRegression()
model_log.fit(X_vThreshodl, Y)

In [54]:
print(model_log.score(X_vThreshodl, Y))

0.9595


----

In [55]:
min_max = MinMaxScaler()
trans = min_max.fit_transform(X_vThreshodl)

In [56]:
model_log2 = LogisticRegression()
model_log2.fit(trans, Y)

In [57]:
print(model_log2.score(trans, Y))

0.95


-------------

In [58]:
##### PCA

model_PCA = PCA(n_components=4)
model_PCA.fit(X)

In [59]:
X_pca = model_PCA.fit_transform(X)

print('shape', X_pca.shape)
print('\n')
print('Explanet_variance:', sum(model_PCA.explained_variance_ratio_))
print('\n')

shape (2000, 4)


Explanet_variance: 0.9990250963312742




In [60]:
model_log3 = LogisticRegression(multi_class='ovr')
model_log3.fit(X_pca, Y)

In [61]:
print(model_log3.score(X_pca, Y))

0.8665


--------------

In [62]:
#### LDA

model_LDA = LinearDiscriminantAnalysis(n_components=2)
model_LDA.fit(x_train, y_train)

In [63]:
x_transform = model_LDA.fit_transform(x_test, y_test)

In [64]:
x_transform

array([[-4.66410968,  0.15832617],
       [-3.24531238,  0.46519068],
       [-1.82342634, -0.10893835],
       ...,
       [ 0.47902992, -0.5552167 ],
       [ 5.58456906,  0.1905178 ],
       [ 4.70142132, -1.54322068]])

In [65]:
model_LDA.score(x_test, y_test)

0.96

---------------

In [58]:
import numpy as np
from sklearn.datasets import make_classification
from scipy.stats import multivariate_normal

# Генерируем данные
X, y = make_classification(n_samples=200, n_features=2, n_classes=2,
                           n_informative=2, n_clusters_per_class=1
                           ,n_redundant=0, n_repeated=0, random_state=42)

# Вычисляем среднее для каждого класса
classes = np.unique(y)
mean_vectors = {c: np.mean(X[y == c], axis=0) for c in classes}

# Вычисляем общую ковариационную матрицу (так как она одинакова для всех классов)
S_W = np.cov(X.T)

# Инвертируем ковариационную матрицу
S_W_inv = np.linalg.inv(S_W)

# Вычисляем априорные вероятности классов
priors = {c: np.mean(y == c) for c in classes}

