In [None]:
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import pingouin as pg

from scipy.spatial.distance import cdist
from scipy.stats import mode

from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge

from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the dataset from a CSV in the working directory, then show its
# (rows, columns) shape and the number of rows per value of the 'mode' column.
df = pd.read_csv('osuDataset.csv')
print(df.shape)
df['mode'].value_counts()

In [None]:
# Keep only the rows whose mode is 'osu' and whose 'ranked' value equals 1,
# then re-check the shape and the remaining 'mode' counts.
# NOTE: `df` is overwritten here, so the unfiltered frame is no longer available.
df = df.query("mode == 'osu' and ranked == 1")
print(df.shape)
df['mode'].value_counts()

In [None]:
# Column dtypes plus non-null counts. `show_counts` replaces the `null_counts`
# keyword, which was deprecated in pandas 1.2 and removed in pandas 2.0.
df.info(verbose=True, show_counts=True)

In [None]:
# Target: the difficulty rating. Predictors: six numeric columns of the frame.
y = df.loc[:, 'difficulty_rating']
x_ = df.loc[:, ['hit_length','cs','drain','ar','accuracy','bpm']]
# Summary statistics of the unscaled predictors.
x_.describe()

In [None]:
# Rescale every predictor column with a MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split is
# made further down, so test rows influence the scaling.
scaler = MinMaxScaler().fit(x_)
x = scaler.transform(x_)
pd.DataFrame(data=x).describe()

In [None]:
# First ten target values.
y.iloc[:10]

In [None]:
# Default figure size (inches) for every plot drawn after this cell.
# The stray `plt.show()` that used to precede this line had no figure to show.
plt.rcParams["figure.figsize"] = (15,5.5)

In [None]:
# Difficulty rating of every row, plotted against its row position.
y_ = y.to_numpy()
plt.scatter(range(len(y_)), y_)
# About 20 ticks along the x axis; the step is clamped to 1 so that fewer than
# 20 rows does not produce a zero step (range() rejects a step of 0).
plt.xticks(range(0, len(y_), max(1, len(y_) // 20)))
plt.title("Ground Truth Difficulty")

In [None]:
# Distribution of the difficulty rating, split into 30 bins.
plt.hist(y_,bins=30)
plt.title("Difficulty Rate Histogram")

In [None]:
# Largest difficulty rating present in the data.
y_.max()

In [None]:
# Scaled hit length (column 0 of x) against row position.
hit_lenght = x[:,0]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(hit_lenght), max(1, len(hit_lenght) // 20)))
plt.scatter(range(len(hit_lenght)), hit_lenght)
plt.title("hit length")
plt.show()

In [None]:
# Scaled circle size (column 1 of x) against row position.
cs = x[:,1]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(cs), max(1, len(cs) // 20)))
plt.scatter(range(len(cs)), cs)
plt.title("Circle Size")
plt.show()

In [None]:
# Scaled drain (column 2 of x) against row position.
drain = x[:,2]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(drain), max(1, len(drain) // 20)))
plt.scatter(range(len(drain)), drain)
plt.title("drain")
plt.show()

In [None]:
# Scaled approach rate (column 3 of x) against row position.
ar = x[:,3]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(ar), max(1, len(ar) // 20)))
plt.scatter(range(len(ar)), ar)
plt.title("Approach Rate")
plt.show()

In [None]:
# Scaled accuracy (column 4 of x) against row position.
acc = x[:,4]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(acc), max(1, len(acc) // 20)))
plt.scatter(range(len(acc)), acc)
plt.title("accuracy")
plt.show()

In [None]:
# Scaled bpm (column 5 of x) against row position.
bpm = x[:,5]
# Step clamped to 1: with fewer than 20 rows the step would be 0, which range() rejects.
plt.xticks(range(0, len(bpm), max(1, len(bpm) // 20)))
plt.scatter(range(len(bpm)), bpm)
plt.title("bpm")
plt.show()

# Relations

## Grupların Anova İle İncelenmesi

Gruplar arasında anlamlı bir fark olup olmadığının F-testi ve ANOVA ile incelenmesi.

In [None]:
# Difficulty against scaled hit length, labelled like the other relation plots.
plt.scatter(hit_lenght, y_)
plt.title("Difficulty-Hit Length Relation")
plt.xlabel("Hit Length")
plt.ylabel("Difficulty")
plt.show()

In [None]:
# Preview of the ANOVA input: scaled hit length beside the difficulty truncated to an integer.
pd.DataFrame(
    np.column_stack((hit_lenght, y_.astype(int))),
    columns=["hit_lenght","difficulty"],
).head()

In [None]:
# One-way ANOVA of hit length across integer-truncated difficulty groups.
anova1_data = pd.DataFrame(
    np.column_stack((hit_lenght, y_.astype(int))),
    columns=["hit_lenght","difficulty"],
)
pg.anova(dv='hit_lenght', between='difficulty', data=anova1_data)

Zorluk (difficulty) değerleri arasında p değeri 0.05'ten küçük olduğu için anlamlı bir fark vardır; yani bu grupların veriyi etkilediği söylenebilir. Tüm grupların kendi içinde etkili olup olmadığını söylemek için t-testi uygulanır. Bunun için pairwise_tukey testi kullanılır ve gruplar kendi içinde incelenir.

In [None]:
# Pairwise Tukey comparison of hit length between difficulty groups; first 50 rows only.
pg.pairwise_tukey(dv='hit_lenght', between='difficulty', data=anova1_data).iloc[:50]

Tukey testine göre birbirinden uzak olan gruplar arasında daha anlamlı bir fark vardır; bu, verinin ordinal (sıralı) olmasından kaynaklanır. Birbirinden uzak gruplardan anlamlı bilgi çıkarılabilir.

In [None]:
# Difficulty against scaled circle size.
plt.scatter(cs, y_)
plt.gca().set(
    title="Difficulty-Circle Size Relation",
    xlabel="Circle Size",
    ylabel="Difficulty",
)
plt.show()

In [None]:
# One-way ANOVA of circle size across integer-truncated difficulty groups.
anova2_data = pd.DataFrame(
    np.column_stack((cs, y_.astype(int))),
    columns=["circle size","difficulty"],
)
pg.anova(dv='circle size', between='difficulty', data=anova2_data)

Zorluk (difficulty) değerleri arasında p değeri 0.05'ten küçük olduğu için anlamlı bir fark vardır; yani bu grupların veriyi etkilediği söylenebilir. Tüm grupların kendi içinde etkili olup olmadığını söylemek için t-testi uygulanır. Bunun için pairwise_tukey testi kullanılır ve gruplar kendi içinde incelenir.

In [None]:
# Pairwise Tukey comparison of the 'circle size' column between the difficulty groups of anova2_data.
pg.pairwise_tukey(data=anova2_data,dv='circle size',between='difficulty')

In [None]:
# Difficulty against scaled drain.
plt.scatter(drain, y_)
plt.gca().set(
    title="Difficulty-Drain Relation",
    xlabel="Drain",
    ylabel="Difficulty",
)
plt.show()

In [None]:
# One-way ANOVA of drain across integer-truncated difficulty groups.
anova3_data = pd.DataFrame(
    np.column_stack((drain, y_.astype(int))),
    columns=["drain","difficulty"],
)
pg.anova(dv='drain', between='difficulty', data=anova3_data)

In [None]:
# Pairwise Tukey comparison of the 'drain' column between the difficulty groups of anova3_data.
pg.pairwise_tukey(data=anova3_data, dv='drain',between='difficulty')

# Anova Devamı

In [None]:
# Difficulty against scaled approach rate.
plt.scatter(ar, y_)
plt.gca().set(
    title="Difficulty-Approach Rate Relation",
    xlabel="Approach Rate",
    ylabel="Difficulty",
)
plt.show()

In [None]:
# One-way ANOVA of approach rate across integer-truncated difficulty groups.
anova4_data = pd.DataFrame(
    np.column_stack((ar, y_.astype(int))),
    columns=["approach rate","difficulty"],
)
pg.anova(dv='approach rate', between='difficulty', data=anova4_data)

In [None]:
# Pairwise Tukey comparison of the 'approach rate' column between the difficulty groups of anova4_data.
pg.pairwise_tukey(data=anova4_data, dv='approach rate',between='difficulty')

In [None]:
# Difficulty against scaled bpm.
plt.scatter(bpm, y_)
plt.gca().set(
    title="Difficulty-bpm Relation",
    xlabel="bpm",
    ylabel="difficulty",
)
plt.show()

In [None]:
# One-way ANOVA of bpm across integer-truncated difficulty groups.
# The "bpm" column must be built from `bpm`; it was previously filled with the
# approach-rate array `ar`, so this test silently repeated the approach-rate ANOVA.
anova5_data = pd.DataFrame(np.c_[bpm, y_.astype(int)],columns=["bpm","difficulty"])
pg.anova(data=anova5_data, dv='bpm', between='difficulty')

In [None]:
# Pairwise Tukey comparison of the 'bpm' column between the difficulty groups of anova5_data.
pg.pairwise_tukey(data=anova5_data, dv='bpm',between='difficulty')

In [None]:
# Difficulty against scaled accuracy. The title previously said "Drain"
# although the plotted feature is accuracy.
plt.scatter( acc, y_ )
plt.title("Difficulty-Accuracy Relation")
plt.xlabel("accuracy")
plt.ylabel("difficulty")
plt.show()

In [None]:
# One-way ANOVA of accuracy across integer-truncated difficulty groups.
# The "accuracy" column must be built from `acc`; it was previously filled with
# the approach-rate array `ar`, so this test silently repeated the approach-rate ANOVA.
anova6_data = pd.DataFrame(np.c_[acc, y_.astype(int)],columns=["accuracy","difficulty"])
pg.anova(data=anova6_data, dv='accuracy', between='difficulty')

In [None]:
# Pairwise Tukey comparison of the 'accuracy' column between the difficulty groups of anova6_data.
pg.pairwise_tukey(data=anova6_data, dv='accuracy',between='difficulty')

One Way ANOVA incelemesi sonucu gereksiz verilerin atılmasına karar verilmiştir.

In [None]:
# Box plot of the six scaled predictors, one box per column.
pd.DataFrame(x, columns=['hit_length','cs','drain','ar','accuracy','bpm']).boxplot()

12, 11, 10, 9 ve 0 zorluk değerlerinden anlamlı bilgi çıkmamaktadır; bu yüzden yeterli veri toplanana kadar bu değerlere ait veriler atılır.

In [None]:
# Number of rows in every integer difficulty bucket, from 0 up to AND including
# the maximum. The previous range stopped one short and dropped the highest bucket.
difficult_count = [np.count_nonzero(anova3_data['difficulty'] == i) for i in range(int(np.max(anova3_data['difficulty'])) + 1)]
plt.bar(range(len(difficult_count)),difficult_count,color ='maroon',width = 0.4)

In [None]:
# Show the per-bucket row counts computed in the previous cell.
difficult_count

# Gereksiz Verinin Atılması

In [None]:
# Glue features and target together (target ends up in column 6) and keep only
# the rows whose difficulty lies in [1, 9).
dataset = np.c_[x, y]
reduced_ds = dataset[(dataset[:, 6] >= 1) & (dataset[:, 6] < 9)]
# Difficulty range that survives the filter.
reduced_ds[:, 6].min(), reduced_ds[:, 6].max()

In [None]:
# From here on x / y refer to the reduced data: first six columns are features,
# column 6 is the difficulty.
x, y = reduced_ds[:, :6], reduced_ds[:, 6]

In [None]:
# Repeat the hit-length ANOVA on the reduced data.
anova_data = pd.DataFrame(
    np.column_stack((x[:, 0], y.astype(int))),
    columns=["hit_length","difficulty"],
)
pg.anova(dv='hit_length', between='difficulty', data=anova_data)

In [None]:
# First five pairwise Tukey comparisons on the reduced data.
pg.pairwise_tukey(dv='hit_length', between='difficulty', data=anova_data).iloc[:5]

# Hazır Modeller İle Deneme

In [None]:
# Baselines on the reduced data (x, y keep only difficulties in [1, 9)).
# Each estimator is scored with 10-fold cross-validation using its default
# scorer; the ten fold scores are averaged. `results` follows the order of `models`.
models = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(), BayesianRidge(), SVR()]
results = [cross_val_score(estimator, x, y, cv = 10).mean() for estimator in models]

results

# Sıfırdan Yazılan Fonksiyonlar

KNN classification

In [None]:
#Euclidean Distance
def eucledian(p1,p2):
    """Return the Euclidean (L2) distance between two equally shaped arrays."""
    offset = p1 - p2
    squared_norm = np.sum(offset * offset)
    return np.sqrt(squared_norm)



#Function to calculate KNN
def predict(x_train, y, x_input, k):
    """Classify each row of ``x_input`` by majority vote of its ``k`` nearest training rows.

    Distances are Euclidean. The vote is taken with ``np.unique`` rather than
    ``scipy.stats.mode(...).mode[0]``: newer SciPy versions return a scalar
    ``mode`` for 1-D input, so indexing it with ``[0]`` fails there.

    Parameters
    ----------
    x_train : array-like, shape (n_train, n_features)
        Training points.
    y : array-like, shape (n_train,)
        Label of each training point (converted to an array, so a pandas
        Series is indexed by position, not by index label).
    x_input : array-like, shape (n_query, n_features)
        Points to classify.
    k : int
        Number of neighbours that vote; values above ``n_train`` use every
        training point.

    Returns
    -------
    list
        One predicted label per row of ``x_input``. When several labels tie,
        the smallest label wins (same tie-break as ``scipy.stats.mode``).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no training points.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    x_train = np.asarray(x_train)
    y = np.asarray(y)
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one point")

    op_labels = []
    for item in np.asarray(x_input):
        # Distance from this query point to every training row in one shot.
        point_dist = np.sqrt(np.sum((x_train - item) ** 2, axis=1))

        # Indices of the k closest rows; a stable sort keeps equidistant
        # neighbours in training order, so results are deterministic.
        nearest = np.argsort(point_dist, kind="stable")[:k]

        # Majority vote: np.unique sorts the labels, argmax takes the first maximum.
        values, counts = np.unique(y[nearest], return_counts=True)
        op_labels.append(values[np.argmax(counts)])

    return op_labels

In [None]:
# Hold-out evaluation of the hand-written KNN classifier: the first 75 % of the
# rows train, the rest test (no shuffling). Labels are the difficulty truncated
# to an integer.
x_train, x_test = np.split(x, [int(0.75*len(x))])
y_train, y_test = (part.astype(int) for part in np.split(y, [int(0.75*len(x))]))
y_pred = predict(x_train, y_train, x_test, 7)

# Share of test rows whose predicted class matches the true one.
accuracy_score(y_test, y_pred)

KNN Regresyon

In [None]:
# Inputs for the KNN regression experiment: same 75/25 row split as the
# classification cell above.
X_train = x_train
X_test = x_test

# The classification cell left y_train / y_test truncated to integers. The
# regression must predict the continuous difficulty, so re-slice the targets
# from `y` at the same split position.
y_train = y[:len(x_train)]
y_test = y[len(x_train):]

In [None]:
# Standardize the features with the per-column mean / standard deviation of the
# training rows only.
# NOTE(review): this cell overwrites X_train, X_test and y_train in place.
# Running it a second time recomputes mu / sigma / mu_y / sigma_y from the
# already standardized data, which would break the back-transformation in
# `knn` (it reads sigma_y and mu_y). Run it exactly once per kernel session.
mu = np.mean(X_train, 0)
sigma = np.std(X_train, 0)

X_train = (X_train - mu ) / sigma

# The test rows reuse the training mean / SD so no test statistics are used.
X_test = (X_test - mu ) / sigma

# Standardize the training targets as well; mu_y / sigma_y are kept as globals
# so predictions can be mapped back to the original scale.
mu_y = np.mean(y_train, 0)
sigma_y = np.std(y_train, 0, ddof = 0)

y_train = (y_train - mu_y ) / sigma_y

In [None]:
# Vectorized KNN regression: compute all train/test distances once, sort them
# once, then evaluate every candidate k on the pre-sorted neighbours.
# (`time` and `cdist` are imported in the imports cell at the top.)
start = time.process_time()

# Candidate K values; the RMSE is computed for each so the best one can be chosen.
k_list = list(range(1, 50))

# Pairwise Euclidean distances, shape (n_train, n_test). cdist avoids building
# the (n_train, n_features, n_test) intermediate that the broadcasting
# expression needed.
distance = cdist(X_train, X_test)

# Column j holds the training-row indices ordered from nearest to farthest
# from test row j. Sorting once here saves re-sorting for every k.
sorted_distance = np.argsort(distance, axis = 0)

#The knn function takes in the sorted distance and returns the RMSE of the predictions
def knn(X_train,X_test,y_train,y_test,sorted_distance,k):
    """Return the RMSE of a k-nearest-neighbour regressor on the test rows.

    Column ``j`` of ``sorted_distance`` is read as the training-row indices
    ordered for test row ``j``; its first ``k`` entries are averaged over
    ``y_train``. The average is mapped back to the original target scale with
    the module-level ``sigma_y`` and ``mu_y`` before it is compared with
    ``y_test``. ``X_train`` is accepted but not used; ``X_test`` only
    determines how many rows are predicted.
    """
    y_pred = np.zeros(y_test.shape)
    for row in range(len(X_test)):
        nearest = sorted_distance[:k, row]
        # Undo the target standardization so the error is in original units.
        y_pred[row] = y_train[nearest].mean() * sigma_y + mu_y

    residual = y_test - y_pred
    return np.sqrt(np.mean(residual**2))

# RMSE for every candidate k, in the same order as k_list.
rmse_list = [knn(X_train,X_test,y_train,y_test,sorted_distance,k) for k in k_list]

# CPU seconds spent since `start` was taken.
print(time.process_time() - start)


In [None]:
# RMSE as a function of the number of neighbours k.
plt.plot(k_list, rmse_list)
plt.xlabel("K values")
plt.ylabel("RMSE")


In [None]:
# K with the smallest RMSE (the first one if several tie).
min_rmse_k_value = k_list[int(np.argmin(rmse_list))]

# RMSE at that k, then expressed as a fraction of the target's value range.
optimal_RMSE = knn(X_train,X_test,y_train,y_test,sorted_distance,min_rmse_k_value)
optimal_RMSE / np.ptp(y)

In [None]:
# Show the k that gave the lowest RMSE.
min_rmse_k_value

Lineer Ağırlıklı Regresyon (Kendimiz Yazdık)

In [None]:
# Linear Regression

class LR():
    """Linear regression fitted by batch gradient descent on the mean squared error.

    After ``fit`` the instance exposes ``W`` (one weight per feature), ``b``
    (intercept), ``m`` / ``n`` (rows / columns of the training matrix) and the
    training data as ``X`` / ``Y``.
    """

    def __init__(self, learning_rate, iterations):
        """Store the gradient step size and the number of descent steps."""
        self.learning_rate, self.iterations = learning_rate, iterations

    def fit(self, X, Y):
        """Train on ``X`` (2-D, rows are samples) and targets ``Y``; return ``self``.

        Weights and intercept restart from zero on every call, so an instance
        can be re-fitted.
        """
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0
        self.X, self.Y = X, Y

        for _ in range(self.iterations):
            self.update_weights()

        return self

    def update_weights(self):
        """Take one gradient-descent step on the stored training data; return ``self``."""
        residual = self.Y - self.predict(self.X)

        # Gradients of the mean squared error with respect to W and b.
        grad_w = - (2 * (self.X.T).dot(residual)) / self.m
        grad_b = - 2 * np.sum(residual) / self.m

        self.W = self.W - self.learning_rate * grad_w
        self.b = self.b - self.learning_rate * grad_b

        return self

    def predict(self, X):
        """Return the linear prediction ``X.dot(W) + b``."""
        return X.dot(self.W) + self.b


In [None]:
# Hand-written linear regression on a random 80/20 split.
model = LR(iterations=1000, learning_rate=0.01)

# A fixed random_state makes the split, and therefore the reported score,
# identical on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Coefficient of determination on the held-out rows.
r2_score(y_test, y_pred)

In [None]:
# 10-fold cross-validation of the hand-written linear regression; the folds are
# taken in row order (KFold is created without shuffling).
kf = KFold(n_splits=10)

res = []
for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    # fit() restarts the weights from zero, so the same instance can be reused per fold.
    y_pred = model.fit(x_train, y_train).predict(x_test)
    res.append(r2_score(y_test, y_pred))

# Mean R^2 over the ten folds.
np.mean(res)