In [None]:
import pandas as pd

# Load the housing data from the local data directory.
df_house = pd.read_csv(filepath_or_buffer='data/AmesHousing.csv')

# Summarise column dtypes and non-null counts to spot incomplete columns.
df_house.info()

In [None]:
# Count the missing values in every column and list the largest counts first.
# `sum` has to be *called*: without the parentheses the expression yields the
# bound method object, which has no `sort_values` attribute.
df_house.isna().sum().sort_values(ascending=False)

In [None]:
# Number of missing values in each column.
na_series = df_house.isna().sum()

# Cut-off: 5% of the number of rows in the frame.
data_len = len(df_house) * 0.05

# Preview the columns that have at least one missing value but no more than
# the cut-off. The comparison is inclusive (<=) so that this preview matches
# the inclusive test used when the column names are collected for `dropna`.
na_series[(na_series <= data_len) & (na_series != 0)]

In [None]:
# Columns with at least one missing value, but no more than the `data_len`
# cut-off; their names are collected into a plain Python list.
has_few_missing = (na_series != 0) & (na_series <= data_len)
col_names = na_series[has_few_missing].index.tolist()

In [None]:
# Discard every row that lacks a value in any of the columns in `col_names`.
df_house = df_house.dropna(axis='index', how='any', subset=col_names)

# (rows, columns) left after the drop.
df_house.shape

In [None]:
# Names of all columns stored with the generic `object` dtype.
object_cols = df_house.select_dtypes(include='object').columns.tolist()

In [None]:
# Feature frame holding only the object-dtype columns.
X_cat = df_house.loc[:, object_cols]
X_cat

In [None]:
# Feature frame holding every column that is not object-dtype.
X_nums = df_house.drop(columns=object_cols)
X_nums

In [None]:
# Target: the sale price as a column vector of shape (n_rows, 1).
# It is read from `df_house` rather than `X_nums` so the cell still works when
# it is re-run after 'SalePrice' has already been removed from the features.
y = df_house['SalePrice'].values.reshape(-1, 1)

# Remove the target from the numeric features. Re-assigning the result (rather
# than `inplace=True`) and passing `errors='ignore'` keeps the cell idempotent:
# a second run no longer raises KeyError.
X_nums = X_nums.drop(columns='SalePrice', errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Both feature sets are split with the same test fraction and seed, paired
# with the same target array `y`.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_cat, y, **split_options
)
X_train_nums, X_test_nums, y_train_nums, y_test_nums = train_test_split(
    X_nums, y, **split_options
)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing categorical values with the most frequent value of each column.
imp_cat = SimpleImputer(strategy='most_frequent')

# Learn the fill values from the training split only.
X_train_cat = imp_cat.fit_transform(X_train_cat)

# Apply the already-fitted imputer to the test split (transform only, no
# re-fit) so that X_test_cat is actually imputed and the test data does not
# influence the fill values.
X_test_cat = imp_cat.transform(X_test_cat)

In [None]:
# Numeric imputer with SimpleImputer's default strategy.
imp_num = SimpleImputer()

# Learn the fill values from the training split only.
X_train_nums = imp_num.fit_transform(X_train_nums)

# Re-use the training-split fill values for the test split. Calling
# `fit_transform` here would re-fit the imputer on the test data, leaking
# test-set statistics into preprocessing.
X_test_nums = imp_num.transform(X_test_nums)

In [None]:
import numpy as np
# Join the numeric and categorical training features column-wise
# (numeric columns first).
X_train = np.concatenate((X_train_nums, X_train_cat), axis=1)
X_train

In [None]:
# Join the numeric and categorical test features column-wise
# (numeric columns first).
X_test = np.concatenate((X_test_nums, X_test_cat), axis=1)
X_test

In [None]:
from sklearn.pipeline import Pipeline
# Load the music data and keep only rows with a value in each listed column.
required_cols = ['genre', 'popularity', 'loudness', 'liveness', 'tempo']
df_music = pd.read_csv('data/music_clean.csv').dropna(subset=required_cols)

# Binary target: 1 where the genre equals 'Rock', 0 everywhere else.
is_rock = df_music['genre'] == 'Rock'
df_music['genre'] = np.where(is_rock, 1, 0)

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df_music.drop(columns='genre').to_numpy()

# Target vector.
y = df_music['genre'].to_numpy()

In [None]:
from sklearn.linear_model import LogisticRegression
# Two-stage pipeline: impute missing values (SimpleImputer defaults), then
# fit a logistic-regression classifier.
imputer_step = ('imputation', SimpleImputer())
classifier_step = ('logistic_regression', LogisticRegression())
steps = [imputer_step, classifier_step]

pipeline = Pipeline(steps)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then score on the held-out split.
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

In [None]:
import pandas as pd
# Load the music-genre data and show summary statistics, transposed so that
# each described column becomes a row.
df_music = pd.read_csv(filepath_or_buffer='data/music_genre.csv')
df_music.describe().transpose()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np 

# Features are every column except the target; the target is 'music_genre'.
X = df_music.drop('music_genre', axis=1).values
y = df_music['music_genre'].values

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only ...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# ... and apply those training-split statistics to the test split.
# `fit_transform` here would re-fit the scaler on the test data, so the two
# splits would be scaled with different parameters (data leakage).
X_test_scaled = scaler.transform(X_test)

# Mean / standard deviation over all values before scaling.
print(np.mean(X), np.std(X))

# Mean / standard deviation over all values of the scaled training split.
print(np.mean(X_train_scaled), np.std(X_train_scaled))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Pipeline: standardise the features, then classify with 6-neighbour KNN.
scale_step = ('scaler', StandardScaler())
knn_step = ('knn', KNeighborsClassifier(n_neighbors=6))
steps = [scale_step, knn_step]

pipeline = Pipeline(steps=steps)

# Fit on the training split; predictions for the test split are kept in
# `y_pred`.
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)

# Score of the scaled pipeline on the test split.
knn_scaled.score(X_test, y_test)

In [None]:
# 6-neighbour KNN trained directly on the unscaled training features.
knn_unscaled = KNeighborsClassifier(n_neighbors=6)
knn_unscaled.fit(X_train, y_train)

# Score of the unscaled model on the test split.
knn_unscaled.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Pipeline whose KNN step is left at its defaults; the neighbour count is
# chosen by the grid search below.
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps=steps)

# Candidate values 1..49 for `n_neighbors` of the step named 'knn'.
neighbor_grid = np.arange(1, 50)
parameters = {"knn__n_neighbors": neighbor_grid}

# Search over the grid on the training split, using GridSearchCV's default
# cross-validation and scoring settings.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

# Best cross-validated score and the parameter setting that produced it.
cv.best_score_, cv.best_params_

In [None]:
# Evaluating classification models

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Load the cleaned music data; 'genre' is the target, all other columns are
# features.
df_music = pd.read_csv("data/music_clean.csv")

X = df_music.drop('genre', axis=1).values
y = df_music['genre'].values

# Hold out 20% of the rows for the final test-set comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and re-use its statistics for the
# test split. Calling `fit_transform` on the test split would re-fit the
# scaler on test data and scale the two splits inconsistently.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, all with default hyper-parameters.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
}

# One shuffled 6-fold splitter shared by every model. It is built once
# outside the loop, and the seed is an explicit integer (the boolean `True`
# used previously was silently interpreted as seed 1); 42 matches the seed
# used for the train/test splits in this notebook.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

# Cross-validated scores per model, in the insertion order of `models`.
results = []

for model in models.values():
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
    results.append(cv_results)

# One box per model showing the spread of its fold scores.
plt.boxplot(results, labels=models.keys())

In [None]:
# test performance

# Fit every candidate model on the scaled training split and report its score
# on the scaled test split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    # `test_score` was previously referenced without ever being assigned,
    # which raised NameError; compute it from the held-out split.
    test_score = model.score(X_test_scaled, y_test)
    print(f"{name} Test Set Accuracy : {test_score} ")