In [19]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import classification_report 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from fwapi.film import Film
import seaborn as sns
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('oceny.csv', parse_dates=['Data'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1192 entries, 0 to 1191
Data columns (total 10 columns):
ID                  1192 non-null int64
Tytuł polski        1192 non-null object
Tytuł oryginalny    904 non-null object
Rok produkcji       1192 non-null int64
Ulubione            3 non-null object
Ocena               1192 non-null object
Komentarz           0 non-null float64
Kraj produkcji      1192 non-null object
Gatunek             1192 non-null object
Data                1192 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 93.2+ KB


In [3]:
def label_encode(series):
    encoder = LabelEncoder()
    encoder.fit(series)
    return encoder.transform(series)

def multibinarize(series):
    mlb = MultiLabelBinarizer()
    series = series.str.split(",", n = 3)
    data = pd.DataFrame(mlb.fit_transform(series), columns=mlb.classes_, index=df.index)
    return mlb.classes_, data
    
def movie_info(id):
    try:
        film = Film.get_by_id(id)
        film.populate()
        return film.budget, film.boxoffice, film.topics_count
    except Exception:
        return None, None, None

df = data.drop(columns=['Komentarz'])
df = df[df.Ocena != 'brak oceny']

# comma-separated columns to OneHotEncode
gatunek_classes, gatunek_data = multibinarize(df.Gatunek)
kraj_classes, kraj_data = multibinarize(df['Kraj produkcji'])

df[gatunek_classes] = gatunek_data
df[kraj_classes] = kraj_data

df['Ocena'] = df.Ocena.astype('int')

# not needed - column will be delete anyway
df['Ulubione'] = label_encode(df.Ulubione.fillna(''))

# warning - takes long time (a lot filmweb api calls)
new_columns = ['budget', 'boxoffice', 'topics_count']
df[new_columns] = df.apply(lambda x: movie_info(x.ID), axis=1, result_type='expand')
#df[new_columns] = df[new_columns].apply(lambda x: x.fillna(x.mean()), axis=0).astype(int)

# final drop of unnecessary columns
#df = df.drop(columns=['ID', 'Gatunek', 'Kraj produkcji', 'Tytuł polski', 'Tytuł oryginalny', 'Data', 'Ulubione'])

df.to_csv(r'oceny_decorated.csv')

df


Unnamed: 0,ID,Tytuł polski,Tytuł oryginalny,Rok produkcji,Ulubione,Ocena,Kraj produkcji,Gatunek,Data,Akcja,...,Szwajcaria,Szwecja,Turcja,USA,Wielka Brytania,Węgry,Włochy,budget,boxoffice,topics_count
0,810167,Joker,,2019,0,7,"Kanada, USA","Dramat, Kryminał, Akcja",2019-10-11,1,...,0,0,0,0,0,0,0,55000000.0,852031557.0,593.0
1,796158,"Podły, okrutny, zły","Extremely Wicked, Shockingly Evil and Vile",2019,0,6,USA,"Biograficzny, Kryminał, Thriller",2019-10-06,0,...,0,0,0,1,0,0,0,,,89.0
2,753119,Prosta historia o morderstwie,,2016,0,3,Polska,"Kryminał, Thriller",2019-10-05,0,...,0,0,0,0,0,0,0,,,58.0
3,797600,"Ciemno, prawie noc",,2019,0,2,Polska,"Kryminał, Thriller",2019-10-05,0,...,0,0,0,0,0,0,0,,,137.0
4,460220,Wyznania zakupoholiczki,Confessions of a Shopaholic,2009,0,3,USA,Komedia rom.,2019-10-04,0,...,0,0,0,1,0,0,0,,108333222.0,215.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,9509,Shrek,,2001,0,6,USA,"Animacja, Familijny, Komedia",2011-08-04,0,...,0,0,0,1,0,0,0,60000000.0,484409218.0,1226.0
1188,837,Podziemny krąg,Fight Club,1999,0,7,"Niemcy, USA","Thriller, Psychologiczny",2011-08-04,0,...,0,0,0,0,0,0,0,63000000.0,100853753.0,1708.0
1189,936,Gladiator,,2000,0,4,"USA, Wielka Brytania",Dramat historyczny,2011-08-04,0,...,0,0,0,1,0,0,0,103000000.0,457640427.0,1641.0
1190,9136,Requiem dla snu,Requiem for a Dream,2000,0,7,USA,Dramat,2011-08-04,0,...,0,0,0,1,0,0,0,4500000.0,7390108.0,2438.0


In [93]:
df = pd.read_csv('oceny_decorated.csv', parse_dates=['Data'], index_col=0)

df = df.drop(columns=['ID', 'Gatunek', 'Kraj produkcji', 'Tytuł polski', 'Tytuł oryginalny', 'Data', 'Ulubione'])

df.topics_count = df.topics_count.fillna(0)

def make_groups(df):
    minidf = df.drop(columns=['budget', 'boxoffice'])
    kmeans = KMeans(n_jobs=-1, n_clusters=5, init='k-means++')
    kmeans.fit(minidf)
    return kmeans.predict(minidf)

def fill_mean(column):
    return df.groupby('group')[column].apply(lambda x: x.fillna(x.mean()))

df['group'] = make_groups(df)        
df['budget'] = fill_mean('budget')
df['boxoffice'] = fill_mean('boxoffice')

df = df.drop(columns=['group'])
df.to_csv(r'oceny_filled.csv')
df

Unnamed: 0,Rok produkcji,Ocena,Akcja,Biblijny,Dla młodzieży,Dokumentalizowany,Dokumentalny,Dramat,Dramat historyczny,Dramat sądowy,...,Szwajcaria,Szwecja,Turcja,USA,Wielka Brytania,Węgry,Włochy,budget,boxoffice,topics_count
0,2019,7,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.500000e+07,8.520316e+08,593.0
1,2019,6,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3.425190e+07,8.281214e+07,89.0
2,2016,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.425190e+07,8.281214e+07,58.0
3,2019,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.425190e+07,8.281214e+07,137.0
4,2009,3,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3.425190e+07,1.083332e+08,215.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,2001,6,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,6.000000e+07,4.844092e+08,1226.0
1188,1999,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,6.300000e+07,1.008538e+08,1708.0
1189,2000,4,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1.030000e+08,4.576404e+08,1641.0
1190,2000,7,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.500000e+06,7.390108e+06,2438.0


## test + train split

In [94]:
X = df.drop(['Ocena'], axis=1)
y = df.loc[:, 'Ocena']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

print("Zbiór treningowy:", train_X.shape, train_y.shape)
print("Zbiór testowy:", test_X.shape, test_y.shape)


Zbiór treningowy: (952, 165) (952,)
Zbiór testowy: (238, 165) (238,)


## DecisionTreeClassifier

In [95]:
tree = DecisionTreeClassifier(
  max_depth=2 
)

rfe = RFE(tree, 4)
X = rfe.fit_transform(X, y)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

tree.fit(train_X, train_y)
predicted_y = tree.predict(test_X)

precision = precision_score(test_y, predicted_y, average="micro")
print("Precision: {:.2f}".format(precision))

accuracy = accuracy_score(test_y, predicted_y)
print("Accuracy: {:.2f}".format(accuracy))

fone_score = f1_score(test_y, predicted_y, average="micro")
print("F1: {:.2f}".format(fone_score))

print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Precision: 0.29
Accuracy: 0.29
F1: 0.29
Report : 
              precision    recall  f1-score   support

           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        23
           5       0.00      0.00      0.00        45
           6       0.29      0.72      0.41        67
           7       0.28      0.36      0.32        55
           8       0.00      0.00      0.00        15
           9       0.00      0.00      0.00         7
          10       0.00      0.00      0.00         1

    accuracy                           0.29       238
   macro avg       0.06      0.12      0.08       238
weighted avg       0.15      0.29      0.19       238



  'precision', 'predicted', average, warn_for)


## RandomForestClassifier

In [85]:
classifier = RandomForestClassifier() 

classifier = classifier.fit(train_X, train_y) 
predicted_y = classifier.predict(test_X) 

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.19747899159663865
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         3
           3       0.16      0.13      0.15        30
           4       0.12      0.12      0.12        24
           5       0.16      0.26      0.20        38
           6       0.23      0.21      0.22        63
           7       0.25      0.30      0.27        50
           8       0.22      0.09      0.12        23
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         1

    accuracy                           0.20       238
   macro avg       0.12      0.11      0.11       238
weighted avg       0.19      0.20      0.19       238



  'precision', 'predicted', average, warn_for)


## KNeighborsClassifier

In [86]:
classifier = KNeighborsClassifier()

classifier = classifier.fit(train_X, train_y)
predicted_y = classifier.predict(test_X)

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.17647058823529413
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.17      0.33      0.22         3
           3       0.18      0.17      0.17        30
           4       0.10      0.08      0.09        24
           5       0.17      0.29      0.21        38
           6       0.17      0.17      0.17        63
           7       0.22      0.20      0.21        50
           8       0.25      0.09      0.13        23
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         1

    accuracy                           0.18       238
   macro avg       0.12      0.13      0.12       238
weighted avg       0.18      0.18      0.17       238



  'precision', 'predicted', average, warn_for)


## SVC

In [87]:
classifier = SVC()

classifier = classifier.fit(train_X, train_y)
predicted_y = classifier.predict(test_X)

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.2647058823529412
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00        24
           5       0.33      0.03      0.05        38
           6       0.26      0.97      0.41        63
           7       0.33      0.02      0.04        50
           8       0.00      0.00      0.00        23
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         1

    accuracy                           0.26       238
   macro avg       0.09      0.10      0.05       238
weighted avg       0.19      0.26      0.13       238



  'precision', 'predicted', average, warn_for)
