## Importing libraries

In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
pd.set_option("max_rows", None)
pd.set_option("max_columns", None)

## Creating Data Frames

In [2]:
result = pd.read_csv('songs.csv')
del result["Unnamed: 0"]
pred_df = pd.read_csv('prediction table.csv')
replace_map={'male':0,'female':1,'group':2}
pred_df["Gender"].replace(replace_map,inplace=True)
pred_df

Unnamed: 0,Year,Country,Artist,Gender,Song,Language
0,1968,Norway,Odd Børre,0,"""Jeg har aldri vært så glad i noen som deg""",Norwegian
1,1974,France,Dani,1,"""La vie à vingt-cinq ans""",French
2,1976,Germany,Tony Marshall,0,"""Der Star""",German
3,1979,Turkey,Maria Rita Epik & 21. Peron,2,"""Seviyorum""",Turkish
4,1982,Greece,Themis Adamadidis,0,"""Sarantapente kopelies""",Greek
5,1986,Greece,Polina,1,"""Wagon-lit""",Greek
6,1988,Cyprus,Yiannis Dimitrou,0,"""Thimame""",Greek
7,1990,Austria,Duett,2,"""Das Beste""",German
8,1999,Bosnia and Herzegovina,Hari Mata Hari,2,"""Starac i more""",Bosnian
9,1999,Germany,Corinna May,1,"""Hör den Kindern einfach zu""",German


## Logistic Regression

In [3]:
df_log_copy=result[["Year","Gender","Was in final"]].copy()
TRAINING_FEATURES = df_log_copy.columns[df_log_copy.columns != 'Was in final']
TARGET_FEATURE = 'Was in final'

X = df_log_copy[TRAINING_FEATURES]
y = df_log_copy[TARGET_FEATURE]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=91)
print("Initial amount of samples: #{}".format(X.shape[0]))
print("Number of training samples: #{}".format(X_train.shape[0]))
print("Number of test samples: #{}".format(X_test.shape[0]))

min_max_scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled_in_range = min_max_scaler.fit_transform(X_train)
print("Min Value: ", X_train_scaled_in_range.min(axis=0))
print("Max Value: ", X_train_scaled_in_range.max(axis=0))
X_test_scaled_in_range = min_max_scaler.transform(X_test)

clf_model = LogisticRegression().fit(X_train, y_train)
y_log_pred=clf_model.predict(X_test)
resLogDF=pd.DataFrame({"Actual":y_test,"Predicted":y_log_pred})

resLogDF["correct"]=abs((resLogDF["Actual"]^resLogDF["Predicted"])-1)
resLogDF[resLogDF["correct"]==1]
print("correct:",len(resLogDF[resLogDF["correct"]==1]))
print("total:",len(resLogDF))
print("correct %:",len(resLogDF[resLogDF["correct"]==1])/len(resLogDF))
resLogDF

Initial amount of samples: #1589
Number of training samples: #1112
Number of test samples: #477
Min Value:  [0. 0.]
Max Value:  [1. 1.]
correct: 387
total: 477
correct %: 0.8113207547169812


Unnamed: 0,Actual,Predicted,correct
1431,1,1,1
1519,1,1,1
1447,1,1,1
349,1,1,1
88,1,1,1
436,1,1,1
924,0,1,0
1171,1,1,1
785,1,1,1
998,0,1,0


In [4]:
metrics.confusion_matrix(y_test, y_log_pred)

array([[ 15,  68],
       [ 22, 372]], dtype=int64)

In [5]:
PREDICTION_FEATURES_LOG=pred_df[["Year","Gender"]].copy()

X_pred_log = PREDICTION_FEATURES_LOG

min_max_scaler = MinMaxScaler(feature_range=(0, 1))
X_predict_scaled_in_range = min_max_scaler.fit_transform(X_pred_log)
print("Min Value: ", X_predict_scaled_in_range.min(axis=0))
print("Max Value: ", X_predict_scaled_in_range.max(axis=0))

y_pred_Log=clf_model.predict(X_pred_log)
predictedLogisticalDF=pd.DataFrame({"Year":pred_df["Year"],"Gender":pred_df["Gender"],"Predicted":y_pred_Log})
predictedLogisticalDF

Min Value:  [0. 0.]
Max Value:  [1. 1.]


Unnamed: 0,Year,Gender,Predicted
0,1968,0,1
1,1974,1,1
2,1976,0,1
3,1979,2,1
4,1982,0,1
5,1986,1,1
6,1988,0,1
7,1990,2,1
8,1999,2,1
9,1999,1,1


## Linear Regression

In [6]:
winnercondition=result["Placing in finals"] == 1
winners_df = result[winnercondition].copy()
winners_df.reset_index(drop=True, inplace=True)
winner_points=winners_df[['Year','Gender','Total score']]

lr = linear_model.LinearRegression()
x_fit = winner_points[['Year','Gender']]
y_fit = winner_points['Total score']
lr.fit(X=x_fit, y=y_fit)

print("Slope:",lr.coef_)
print("Intercept:",lr.intercept_)
print("R2:",lr.score(x_fit,y_fit))
print("R2:",r2_score(y_fit,lr.predict(x_fit.values)))

PREDICTION_FEATURES_LIN=pred_df[["Year","Gender"]].copy()

x_pred_lin = PREDICTION_FEATURES_LIN
y_pred_lin=lr.predict(x_pred_lin)
predictedLinearDF=pd.DataFrame({"Year":pred_df["Year"],"Gender":pred_df["Gender"],"Predicted":y_pred_lin})
predictedLinearDF

Slope: [  6.7295686  -23.11052216]
Intercept: -13172.657329566313
R2: 0.747022322839864
R2: 0.747022322839864


Unnamed: 0,Year,Gender,Predicted
0,1968,0,71.133669
1,1974,1,88.400559
2,1976,0,124.970218
3,1979,2,98.93788
4,1982,0,165.34763
5,1986,1,169.155382
6,1988,0,205.725041
7,1990,2,172.963134
8,1999,2,233.529251
9,1999,1,256.639774
