In [160]:
import sys
import os
# NOTE(review): machine-specific absolute path. Every relative path used further down
# ('Dev_Set/...') is resolved against this working directory, so the notebook only
# runs where this folder exists.
os.chdir(r'C:\Users\Pia\OneDrive\Data Science\experiment_design\ex_2\CoE_dataset')

import numpy as np
import pandas as pd
# display limits for rendered DataFrames: up to 34 columns and 100 rows
pd.options.display.max_columns = 34
pd.options.display.max_rows = 100

from datetime import datetime
import xml.etree.ElementTree as et
from random import seed, sample, choice
import sklearn as sk
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier

import warnings
# only DeprecationWarning is silenced; other warning categories are still shown
warnings.filterwarnings("ignore", category=DeprecationWarning)

modules imported successfully


## Use this chunk of code for loading the training data
We'll do one-hot encoding after feature selection

In [161]:
# afterwards we'll do the same thing for the test set
data_set = 'Dev'
xml_dir = os.path.join(data_set + '_Set', 'XML')

# get the column names: they are the attribute names of the <movie> element of one
# reference file (always taken from the Dev set, independent of `data_set`)
xtree = et.parse(os.path.join('Dev_Set', 'XML', 'A_Fish_Called_Wanda.xml'))
xroot = xtree.getroot()
columns = list(xroot.find('movie').keys())

# Read each XML file into one record (attribute values in `columns` order, None where
# an attribute is absent) and build the DataFrame once at the end. Growing a frame
# row by row with DataFrame.append is quadratic, and the method no longer exists in
# pandas >= 2.0.
records = []
for movie_name in os.listdir(xml_dir):
    movie = et.parse(os.path.join(xml_dir, movie_name)).getroot().find('movie')
    records.append([movie.get(element) for element in columns])

df = pd.DataFrame(records, columns=columns).set_index('title')

# keep only the features used for modelling; .copy() makes the column assignments
# below operate on an independent frame
df = df.loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']].copy()

# make the feature 'runtime' numeric: cut off the last four characters (presumably a
# ' min' unit suffix -- TODO confirm against the XML) and coerce what is left;
# .str slicing leaves missing values as NaN instead of raising
df['runtime'] = pd.to_numeric(df['runtime'].str[:-4], errors='coerce')

# bring the entries of 'rated' which were not filled out into a unique shape
# (assigned back explicitly: an in-place replace on `df.rated` is not guaranteed to
# write through to `df`)
df['rated'] = df['rated'].replace(['NOT RATED', 'UNRATED'], 'N/A')

df['year'] = pd.to_numeric(df['year'], errors='coerce')

# NOTE(review): df and labels are later handed to scikit-learn, which pairs rows by
# position, not by index -- confirm that the spreadsheet lists the movies in the same
# order as os.listdir returns the XML files.
labels = pd.read_excel(os.path.join('Dev_Set', 'dev_set_groundtruth_and_trailers.xls'), index_col=0).goodforairplane
df.head(100)

Unnamed: 0_level_0,language,year,genre,country,runtime,rated
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Gangster,English,2007,"Biography, Crime, Drama","USA, UK",157,R
American Pie,English,1999,"Comedy, Romance",USA,95,R
Andaz Apna Apna,Hindi,1994,"Comedy, Family, Romance",India,160,PG
Anna Karenina,English,2012,"Drama, Romance",UK,129,R
A Fish Called Wanda,"English, Italian, Russian",1988,"Comedy, Crime","USA, UK",108,R
A Goofy Movie,English,1995,"Animation, Adventure, Comedy",USA,78,G
A Million Ways to Die in the West,"English, Navajo, Mandarin",2014,"Comedy, Western",USA,116,R
A Single Man,"English, Spanish",2009,Drama,USA,99,R
Babar: The Movie,"English, French",1989,"Animation, Adventure, Family","Canada, France",70,G
Bhoothnath Returns,Hindi,2014,"Comedy, Drama, Fantasy",India,155,


## Use this for computing the metrics:

In [162]:
def compute_scores(X, y, classifier):
    """Cross-validate one of the supported classifiers on ``X`` / ``y``.

    Parameters
    ----------
    X : feature matrix, handed to ``cross_validate`` unchanged.
    y : target labels, handed to ``cross_validate`` unchanged.
    classifier : str
        One of 'knn', 'nearest_mean', 'decision_tree', 'logistic_regression',
        'svm', 'bagging', 'random_forest', 'adaboost', 'gradient_boost'.

    Returns
    -------
    The result of ``cross_validate`` with 10 folds and the scorers 'precision',
    'recall' and 'f1', or ``None`` (after printing the list of valid names) when
    ``classifier`` is not a supported name.
    """
    # Lazy factories: only the requested estimator is instantiated, and the error
    # message below is generated from the same table so it cannot drift out of sync.
    factories = {
        'knn': lambda: KNeighborsClassifier(),
        'nearest_mean': lambda: NearestCentroid(),
        'decision_tree': lambda: DecisionTreeClassifier(),
        'logistic_regression': lambda: LogisticRegression(solver='lbfgs'),  # for not getting warnings
        'svm': lambda: SVC(kernel='rbf', gamma='auto'),  # for avoiding warnings
        'bagging': lambda: BaggingClassifier(),
        'random_forest': lambda: RandomForestClassifier(n_estimators=10),  # for not getting warnings
        'adaboost': lambda: AdaBoostClassifier(),
        'gradient_boost': lambda: GradientBoostingClassifier(),
    }

    factory = factories.get(classifier) if isinstance(classifier, str) else None
    if factory is None:
        print('Parameter \'classifier\' needs to be one of the following strings:')
        print(', '.join('\'' + name + '\'' for name in factories))
        return

    clf = factory()

    np.random.seed(1)  # scikit-learn uses numpy.random
    return cross_validate(clf, X, y, cv=10, scoring=['precision', 'recall', 'f1'])

In [163]:
# FIXME: something is still not working: if you run this chunk of code a second time, fewer features are selected and the
# scores are lower

def get_dummies(df, selected_features=['language', 'genre', 'country', 'rated']):
    """Replace the selected categorical columns of ``df`` by dummy (indicator) columns.

    Only 'language', 'genre', 'country' and 'rated' are treated as categorical; any
    other name in ``selected_features`` is ignored and its column is left as it is.
    A cell may hold several values separated by ', '; each value becomes its own
    ``<feature>_<value>`` column and a row gets a count for every value it lists.
    The first category level of each feature is dropped (``drop_first=True``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    selected_features : iterable of str
        Names of the columns to encode. Features are processed in the order given
        (duplicates ignored), so the resulting column order is reproducible.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each encoded column removed and its dummy columns appended.
        Rows keep their original order; a row whose categorical value is missing is
        kept and gets 0 in all dummy columns of that feature.
    """
    categorical = ('language', 'genre', 'country', 'rated')

    # Keep the caller's order instead of iterating over a set: the iteration order of
    # a set of strings changes between interpreter sessions, which made the column
    # order (and with it the behaviour of order-sensitive estimators) irreproducible.
    to_encode = []
    for feature in selected_features:
        if feature in categorical and feature not in to_encode:
            to_encode.append(feature)

    df_tmp = df.copy()
    for feature in to_encode:
        # split the variables with various entries into one token per row:
        tokens = df.loc[:, feature].str.split(', ', expand=True).stack()

        # one indicator column per token, summed back to one row per original index label
        one_hot = pd.get_dummies(tokens, prefix=feature, drop_first=True).groupby(level=0).sum()

        # Align with the input rows. The groupby result is sorted by label and has no
        # entry for rows without a value; reindexing restores the input order and fills
        # those rows with 0 instead of dropping them (an inner merge would lose them
        # and break the positional pairing with the label vector).
        one_hot = one_hot.reindex(df_tmp.index, fill_value=0)

        df_tmp = pd.concat([df_tmp.drop(feature, axis=1), one_hot], axis=1)

    return df_tmp

    
def LVW(X, y, K, classifier):
    """Randomised wrapper feature selection (presumably the "Las Vegas Wrapper").

    Repeatedly draws a random subset of the columns of ``X``, one-hot encodes it with
    ``get_dummies`` and scores it with ``compute_scores``. A subset becomes the new
    best when its mean f1 is higher than the best so far, or equal with fewer
    features. The search stops after ``K`` consecutive draws without improvement.
    The ``random`` module is seeded with 1, so the sequence of draws is the same on
    every call.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw (not yet one-hot encoded) features.
    y : labels, passed on to ``compute_scores``.
        NOTE(review): rows of X and y appear to be paired by position -- confirm.
    K : int
        Number of consecutive non-improving draws after which the search stops.
    classifier : str
        Classifier name understood by ``compute_scores``.

    Returns
    -------
    pandas.DataFrame
        One row (index label 'classifier') with the columns 'classifier',
        'used_features' (comma separated), 'precision', 'recall' and 'f1' of the
        best subset.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 (no subset would ever be evaluated) or if
        ``compute_scores`` does not know ``classifier``.
    """
    if K < 1:
        raise ValueError('K must be at least 1, otherwise no feature subset is evaluated')

    original_features = list(X.columns)
    sample_sizes = range(1, len(original_features) + 1)  # possible subset sizes

    f1_best = 0
    C = len(original_features)  # size of the best subset so far
    S = None                    # best subset so far; None until a first one is accepted
    precision = recall = None
    k = 0                       # consecutive draws without improvement

    seed(1)  # seed for package 'random'
    while k < K:
        S1 = sample(original_features, choice(sample_sizes))
        C1 = len(S1)

        # get_dummies works on its own copy, so X itself is never modified
        X1 = get_dummies(X.loc[:, S1], S1)

        scores = compute_scores(X1, y, classifier)
        if scores is None:
            raise ValueError('compute_scores does not know the classifier %r' % (classifier,))
        f1 = np.mean(scores['test_f1'])

        # The first candidate is always accepted, so S, precision and recall are
        # guaranteed to be set when the loop ends (previously they stayed undefined
        # when no draw beat the initial f1 of 0).
        if S is None or f1 > f1_best or (f1 == f1_best and C1 < C):
            k, f1_best, C, S = 0, f1, C1, S1
            precision, recall = np.mean(scores['test_precision']), np.mean(scores['test_recall'])
        else:
            k += 1

    used_features = ', '.join(S)
    print(S)

    return pd.DataFrame({'classifier': [classifier], 'used_features': [used_features], 'precision': [precision],
                         'recall': [recall], 'f1': [f1_best]}, index=['classifier'])


# as the decision tree does not need the LVW, we compute it first without the LVW:
scores_tree = compute_scores(get_dummies(df), labels, 'decision_tree')

# one single-row frame per classifier; they are concatenated once at the end because
# DataFrame.append copies the whole frame on every call and was removed in pandas 2.0
score_frames = [
    pd.DataFrame({'classifier': ['decision_tree'], 'used_features': ['all'],
                  'precision': [np.mean(scores_tree['test_precision'])],
                  'recall': [np.mean(scores_tree['test_recall'])],
                  'f1': [np.mean(scores_tree['test_f1'])]},
                 index=['classifier'])
]

for clf in ['knn', 'nearest_mean', 'logistic_regression', 'svm', 'bagging', 'random_forest', 'adaboost', 'gradient_boost']:
    score_frames.append(LVW(df, labels, 10, clf))

scores = pd.concat(score_frames)
scores

['runtime', 'language']
['language', 'country']
['country', 'rated']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['language']
['language', 'country']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['runtime', 'language']
['genre', 'year']
['genre']
['language']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['rated']
['country']
['country']
['runtime']
['country']
['runtime', 'year']
['year', 'rated']
['genre', 'language']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['genre', 'year']
['runtime', 'language']
['language', 'country']
['country', 'rated']
['language']
['language', 'country']
['runtime', 'language']
['genre', 'year']
['genre']
['language']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['rated']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['country']
['runtime', 'language']
['runtime', 'language']




['language', 'country']
['country', 'rated']
['language']
['language', 'country']
['runtime', 'language']




['genre', 'year']
['genre']
['language']
['rated']
['country']
['runtime', 'language']
['runtime', 'language']
['language', 'country']
['country', 'rated']
['language']
['language']
['country']
['rated']
['genre']
['runtime']
['genre']
['language']
['rated']
['country']
['country']
['language']
['runtime', 'language']
['language', 'country']
['country', 'rated']
['language']
['language', 'country']
['runtime', 'language']
['genre', 'year']
['genre']
['language']
['rated']
['country']
['country']
['runtime']
['country', 'rated']
['runtime', 'language']
['language', 'country']
['country', 'rated']
['language']
['language', 'country']
['runtime', 'language']
['genre', 'year']
['genre']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['language']
['rated']
['country']
['country']
['language', 'country']
['runtime', 'language']
['language', 'country']
['country', 'rated']
['language']
['language', 'country']
['runtime', 'language']
['genre', 'year']
['genre']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['language']
['rated']
['country']
['runtime', 'language']
['runtime', 'language']
['language', 'country']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['country', 'rated']
['language']
['language', 'country']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


['runtime', 'language']
['genre', 'year']
['genre']
['language']
['rated']
['country']
['runtime', 'language']


Unnamed: 0,classifier,used_features,precision,recall,f1
classifier,decision_tree,all,0.595595,0.616667,0.593802
classifier,knn,"genre, year",0.66631,0.666667,0.639066
classifier,nearest_mean,"runtime, language",0.62369,0.61,0.601805
classifier,logistic_regression,"runtime, language",0.596032,0.73,0.652597
classifier,svm,language,0.54798,1.0,0.707843
classifier,bagging,"country, rated",0.592857,0.593333,0.58073
classifier,random_forest,"language, country",0.657897,0.596667,0.590458
classifier,adaboost,"runtime, language",0.725833,0.633333,0.646378
classifier,gradient_boost,"runtime, language",0.709167,0.673333,0.647529


In [None]:
#def check_unique_entries(df, sort=True):
#    # columns.remove('title')
#    for column in df.columns:
#        unique_entries = df[column].unique()
#        if sort:
#            unique_entries.sort()
#        print(column, len(unique_entries), '\n', unique_entries)

#df_train.runtime = df_train.runtime.apply(lambda x: x[:-4])
#df_train.BoxOffice = df_train.BoxOffice.apply(lambda x: x if x!=x else x[1:-3] + x[len(x)-2:])
#df_train.BoxOffice = df_train.BoxOffice.apply(lambda x: x if x!=x else x[:-1] + '0'*5 if x[-1]=='M' else x[:-1] + '0'*2)
#df_train.BoxOffice

#df_train.drop(['released', 'metascore', 'imdbRating', 'imdbVotes', 'tomatoMeter', 'tomatoRating'])

#df_to_int = df_train.loc[:, ['year', 'runtime', 'metascore', 'imdbVotes', 'tomatoMeter', 'tomatoReviews', 'tomatoFresh', 
#                          'tomatoRotten', 'tomatoUserMeter', 'tomatoUserReviews', 'BoxOffice']]

#def df_from_xml(data_set, columns):
#    df = pd.DataFrame(columns = columns)
#    for movie_name in os.listdir(data_set + '_Set/XML'):  
#        xtree = et.parse(data_set + '_Set/XML/' + movie_name)
#        xroot = xtree.getroot()
#        instance = []
#        for element in columns:
#            if xroot[0] is not None: # and element != 'goodforairplane': # feature "good for airplane" is not filled in 
#                instance.append(xroot.find('movie').get(element))
#            else:
#                pass
#        if len(instance) < len(columns):
#            instance.append(movie_name[:-3])
#        my_series = pd.Series(instance, index = columns)
#        df = df.append(my_series, ignore_index=True)
#    # labels_to_drop = ['director', 'writer', 'actors', 'plot', 'awards', 'poster', 'tomatoConsensus', 'Website', 'imdbID', 'type']
#    # df_train.replace('N/A', np.nan, inplace=True)
#    return df
#
## df_test = df_from_xml('Test')
## df = df_dev.append(df_test, ignore_index=True)
#df = df.loc[:, ['title', 'language', 'year', 'genre', 'country', 'runtime', 'rated']]
#df.runtime = df.runtime.apply(lambda x: x[:-4])
#df.rated.replace(['NOT RATED', 'UNRATED'], 'N/A', inplace=True)
#df.year = pd.to_numeric(df.year, errors='coerce')
#df.runtime = pd.to_numeric(df.runtime, errors='coerce')
#    
##df_dev = df.iloc[:95]
##df_test = df.iloc[95:]
#labels = pd.read_excel(r'Dev_Set\dev_set_groundtruth_and_trailers.xls', index_col=0).goodforairplane
## labels_test = pd.read_csv(r'Test_Set\CoeTestLabels.csv', index_col=0).goodforairplanes
#df.set_index('title', inplace=True)
## df_dev = df_dev.merge(labels_dev, left_index=True, right_index=True)
## df_test.set_index('title', inplace=True)
## df_test = df_test.merge(labels_test, left_index=True, right_index=True)
## df_test.shape