In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import classification_report 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from fwapi.film import Film

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('oceny.csv', parse_dates=['Data'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1192 entries, 0 to 1191
Data columns (total 10 columns):
ID                  1192 non-null int64
Tytuł polski        1192 non-null object
Tytuł oryginalny    904 non-null object
Rok produkcji       1192 non-null int64
Ulubione            3 non-null object
Ocena               1192 non-null object
Komentarz           0 non-null float64
Kraj produkcji      1192 non-null object
Gatunek             1192 non-null object
Data                1192 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 93.2+ KB


In [3]:
def label_encode(series):
    encoder = LabelEncoder()
    encoder.fit(series)
    return encoder.transform(series)

def multibinarize(series):
    mlb = MultiLabelBinarizer()
    series = series.str.split(",", n = 3)
    data = pd.DataFrame(mlb.fit_transform(series), columns=mlb.classes_, index=df.index)
    return mlb.classes_, data
    
def movie_info(id):
    try:
        film = Film.get_by_id(id)
        film.populate()
        return film.budget, film.boxoffice, film.topics_count
    except Exception:
        return None, None, None

df = data.drop(columns=['Komentarz'])
df = df[df.Ocena != 'brak oceny']

# comma-separated columns to OneHotEncode
gatunek_classes, gatunek_data = multibinarize(df.Gatunek)
kraj_classes, kraj_data = multibinarize(df['Kraj produkcji'])

df[gatunek_classes] = gatunek_data
df[kraj_classes] = kraj_data

df['Ocena'] = df.Ocena.astype('int')

# not needed - column will be delete anyway
df['Ulubione'] = label_encode(df.Ulubione.fillna(''))

# warning - takes long time (a lot filmweb api calls)
new_columns = ['budget', 'boxoffice', 'topics_count']
df[new_columns] = df.apply(lambda x: movie_info(x.ID), axis=1, result_type='expand')
df[new_columns] = df[new_columns].apply(lambda x: x.fillna(x.mean()), axis=0).astype(int)

# final drop of unnecessary columns
df = df.drop(columns=['ID', 'Gatunek', 'Kraj produkcji', 'Tytuł polski', 'Tytuł oryginalny', 'Data', 'Ulubione'])

df


Unnamed: 0,Rok produkcji,Ocena,Akcja,Biblijny,Dla młodzieży,Dokumentalizowany,Dokumentalny,Dramat,Dramat historyczny,Dramat sądowy,...,Szwajcaria,Szwecja,Turcja,USA,Wielka Brytania,Węgry,Włochy,budget,boxoffice,topics_count
0,2019,7,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,55000000,738575844,530
1,2019,6,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,52639423,172627908,89
2,2016,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,52639423,172627908,57
3,2019,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,52639423,172627908,136
4,2009,3,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,52639423,108333222,215
5,2019,5,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,52639423,172627908,24
6,2019,4,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,52639423,172627908,5
7,2019,4,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,5000000,60582764,28
8,2019,6,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,52639423,172627908,8
9,2019,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,52639423,172627908,58


## test + train split

In [4]:
X = df.drop(['Ocena'], axis=1)
y = df.loc[:, 'Ocena']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

print("Zbiór treningowy:", train_X.shape, train_y.shape)
print("Zbiór testowy:", test_X.shape, test_y.shape)


Zbiór treningowy: (952, 165) (952,)
Zbiór testowy: (238, 165) (238,)


## DecisionTreeClassifier

In [5]:
tree = DecisionTreeClassifier(
  max_depth=2 
)

rfe = RFE(tree, 4)
X = rfe.fit_transform(X, y)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

tree.fit(train_X, train_y)
predicted_y = tree.predict(test_X)

precision = precision_score(test_y, predicted_y, average="micro")
print("Precision: {:.2f}".format(precision))

accuracy = accuracy_score(test_y, predicted_y)
print("Accuracy: {:.2f}".format(accuracy))

fone_score = f1_score(test_y, predicted_y, average="micro")
print("F1: {:.2f}".format(fone_score))

print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Precision: 0.22
Accuracy: 0.22
F1: 0.22
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00        29
           5       0.00      0.00      0.00        48
           6       0.21      0.70      0.33        57
           7       0.24      0.26      0.25        47
           8       0.00      0.00      0.00        22
           9       0.00      0.00      0.00         6
          10       0.00      0.00      0.00         2

    accuracy                           0.22       238
   macro avg       0.05      0.10      0.06       238
weighted avg       0.10      0.22      0.13       238



  'precision', 'predicted', average, warn_for)


## RandomForestClassifier

In [6]:
classifier = RandomForestClassifier() 

classifier = classifier.fit(train_X, train_y) 
predicted_y = classifier.predict(test_X) 

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.19327731092436976
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.11      0.08      0.10        24
           4       0.13      0.07      0.09        29
           5       0.14      0.10      0.12        48
           6       0.29      0.42      0.34        57
           7       0.18      0.23      0.21        47
           8       0.12      0.09      0.11        22
           9       0.00      0.00      0.00         6
          10       0.00      0.00      0.00         2

    accuracy                           0.19       238
   macro avg       0.10      0.10      0.10       238
weighted avg       0.17      0.19      0.18       238



  'precision', 'predicted', average, warn_for)


## KNeighborsClassifier

In [7]:
classifier = KNeighborsClassifier()

classifier = classifier.fit(train_X, train_y)
predicted_y = classifier.predict(test_X)

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.15966386554621848
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.17      0.21      0.19        24
           4       0.18      0.07      0.10        29
           5       0.08      0.08      0.08        48
           6       0.19      0.26      0.22        57
           7       0.20      0.23      0.21        47
           8       0.12      0.05      0.07        22
           9       0.00      0.00      0.00         6
          10       0.00      0.00      0.00         2

    accuracy                           0.16       238
   macro avg       0.09      0.09      0.09       238
weighted avg       0.15      0.16      0.15       238



  'precision', 'predicted', average, warn_for)


## SVC

In [8]:
classifier = SVC()

classifier = classifier.fit(train_X, train_y)
predicted_y = classifier.predict(test_X)

print ('Accuracy Score : ', accuracy_score(test_y, predicted_y)) 
print ('Report : ') 
print (classification_report(test_y, predicted_y)) 

Accuracy Score :  0.21008403361344538
Report : 
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00        29
           5       0.20      0.02      0.04        48
           6       0.26      0.16      0.20        57
           7       0.21      0.85      0.34        47
           8       0.00      0.00      0.00        22
           9       0.00      0.00      0.00         6
          10       0.00      0.00      0.00         2

    accuracy                           0.21       238
   macro avg       0.07      0.10      0.06       238
weighted avg       0.14      0.21      0.12       238



  'precision', 'predicted', average, warn_for)
