In [15]:
from os import chdir
chdir(r'C:\Users\Pia\OneDrive\Data Science\experiment_design\ex_2\CoE_dataset')
import xml.etree.ElementTree as et

from random import seed, sample, choice
import numpy as np
import pandas as pd
pd.options.display.max_columns = 34
pd.options.display.max_rows = 100

from metadata_ratings_helper_functions import load_data, get_dummies

from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier

## Use this chunk of code for loading the training data
We'll do one-hot encoding after feature selection

In [47]:
df = load_data('Dev').loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']]
df_test = load_data('Test').loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']]

labels = pd.read_excel(r'Dev_Set\dev_set_groundtruth_and_trailers.xls', index_col=0).goodforairplane

df_test.head(100)

Unnamed: 0_level_0,language,year,genre,country,runtime,rated
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.000 Km,"Spanish, Catalan, English",2014,"Comedy, Drama, Romance",Spain,99.0,R
12 Years a Slave,English,2013,"Biography, Drama, History","USA, UK",134.0,R
21 Jump Street,English,2012,"Action, Comedy, Crime",USA,109.0,R
2 States,Hindi,2014,"Comedy, Drama, Romance",India,149.0,
Aanmodderfakker,Dutch,2014,"Comedy, Romance",Netherlands,100.0,
Alexander,English,2004,"Action, Adventure, Biography","Germany, USA, Netherlands, France, UK, Italy",175.0,R
"Alexander and the Terrible, Horrible, No Good, Very Bad Day",English,2014,"Comedy, Family",USA,81.0,PG
Alice in Wonderland,English,2010,"Adventure, Family, Fantasy",USA,108.0,PG
American Sniper,English,2014,"Action, Biography, Thriller",USA,132.0,R
Anastasia,"English, Russian, French",1997,"Animation, Adventure, Drama","USA, Canada, UK",94.0,G


## Use this for computing the metrics:

In [56]:
def get_clf(classifier):
        
    if classifier == 'knn':
        return KNeighborsClassifier()
        
    elif classifier == 'nearest_mean':
        return NearestCentroid()
        
    elif classifier == 'decision_tree':
        return DecisionTreeClassifier()
        
    elif classifier == 'logistic_regression':
        return LogisticRegression(solver='lbfgs')  # for not getting warnings
        
    elif classifier == 'svm':
        return SVC(kernel = 'rbf', gamma='auto')  # for avoiding warnings    
            
    elif classifier == 'bagging':
        return BaggingClassifier()
        
    elif classifier == 'random_forest':
        return RandomForestClassifier(n_estimators=10)  # for not getting warnings
        
    elif classifier == 'adaboost':
        return AdaBoostClassifier()
        
    elif classifier == 'gradient_boost':
        return GradientBoostingClassifier()
    

def compute_scores(X, y, classifier):    
    
    np.random.seed(1)  # scikit-learn uses numpy.random
    return cross_validate(get_clf(classifier), X, y, cv = 10, scoring = ['precision', 'recall', 'f1'])


def compute_preds(X, y, X_test, classifier, S):
    X_best = X.copy().loc[:, S]
    X_best = get_dummies(X_best, S)
    
    np.random.seed(1)
    preds_cv = cross_val_predict(get_clf(classifier), X_best, y, cv=10)
    
    clf = get_clf(classifier)
    clf.fit(X_best, y)
    
    X_best_test = X_test.copy().loc[:, S]
    X_best_test = get_dummies(X_best_test, S)
    
    preds = clf.predict(X_best_test)

    return preds_cv, preds

In [60]:
# something is still not working, if you run this chunk of code a second time, the features, that are elected are less and the 
# scores are lower
    
def LVW(X, y, X_test, K, classifier):
     
    original_features = list(X.columns)    
    f1_best = 0
    k = 0
    C = len(original_features)
        
    seed(1)  # seed for package 'random'
    while k < K:
        sample_sizes = range(1, C + 1)
        my_choice = choice(sample_sizes)
        S1 = sample(original_features, my_choice)
        C1 = len(S1)
        
        # take a subset of the original dataframe X:
        X1 = X.copy().loc[:, S1]
        
        X1 = get_dummies(X1, S1)
        
        scores = compute_scores(X1, y, classifier)
        f1 = np.mean(scores['test_f1'])
        
        if (f1 > f1_best) or (f1 == f1_best and C1 < C):
            k, f1_best, C, S  = 0, f1, C1, S1
            precision, recall = np.mean(scores['test_precision']), np.mean(scores['test_recall'])
        else:
            k += 1
        
    # get a string: 
    used_features = S[0]
    for feature in S[1:]:
        used_features += ', ' + feature
    

    predictions_cv, predictions = compute_preds(X, y, X_test, classifier, S)

    return pd.DataFrame({'classifier': [classifier], 'used_features': [used_features], 'precision': [precision], 
                         'recall': [recall], 'f1': f1_best}, index=['classifier']), predictions_cv, predictions



# as the decision tree does not need the LVW, we compute it first without the LVW: 
df_with_dummies = get_dummies(df)
df_with_dummies_test = get_dummies(df_test)
df_with_dummies_test = df_with_dummies_test.fillna(df_with_dummies_test.mean())
scores_tree = compute_scores(df_with_dummies, labels, 'decision_tree')

scores = pd.DataFrame({'classifier': ['decision_tree'], 'used_features': ['all'], 
                       'precision': [np.mean(scores_tree['test_precision'])], 
                       'recall': [np.mean(scores_tree['test_recall'])], 'f1': np.mean(scores_tree['test_f1'])},
                        index=['classifier'])

preds_tree, preds_tree_test = compute_preds(df_with_dummies, labels, df_with_dummies_test, 'decision_tree', df_with_dummies.columns)
preds = pd.DataFrame(preds_tree, index=labels.index, columns=['decision_tree'])

for clf in ['knn', 'nearest_mean', 'logistic_regression', 'svm', 'bagging', 'random_forest', 'adaboost', 'gradient_boost']: 
    scores_tmp, preds_cv_tmp, preds_tmp = LVW(df, labels, df_test, 10, clf)
    scores = scores.append(scores_tmp)
    preds = preds.merge(pd.DataFrame(preds_cv_tmp, index=labels.index, columns=[clf]), 
                        left_index=True, right_index=True)
    
preds.to_csv('predictions_metadata_dev.csv')
preds

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
## afterwards we'll do the same thing for the test set
#data_set = 'Dev'
#
## get the columnnames:
#xtree = et.parse(r'Dev_Set\XML\A_Fish_Called_Wanda.xml')
#xroot = xtree.getroot()
#columns = list(xroot.find('movie').keys())
#df = pd.DataFrame(columns = columns)
#
#for movie_name in os.listdir(data_set + '_Set/XML'):  
#    xtree = et.parse(data_set + '_Set/XML/' + movie_name)
#    xroot = xtree.getroot()
#    instance = []
#    for element in columns:
#        if xroot[0] is not None: # and element != 'goodforairplane': # feature "good for airplane" is not filled in 
#            instance.append(xroot.find('movie').get(element))
#        else:
#            pass
#    if len(instance) < len(columns):
#        instance.append(movie_name[:-3])
#    my_series = pd.Series(instance, index = columns)
#    df = df.append(my_series, ignore_index=True)
#
#df.set_index('title', inplace=True)
#
#df = df.loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']]
#
## make the feature 'runtime' numeric:
#df.runtime = df.runtime.apply(lambda x: x[:-4])
#df.runtime = pd.to_numeric(df.runtime, errors='coerce')
#
## bring the entries of 'rated' which were not filled out into a unique shape:
#df.rated.replace(['NOT RATED', 'UNRATED'], 'N/A', inplace=True)
#
#df.year = pd.to_numeric(df.year, errors='coerce')
#
#labels = pd.read_excel(r'Dev_Set\dev_set_groundtruth_and_trailers.xls', index_col=0).goodforairplane
#df.head(100)





#def get_dummies(df, selected_features=['language', 'genre', 'country', 'rated']):
#    
#    df_tmp = df.copy()
#    for feature in set(selected_features).intersection(['language', 'genre', 'country', 'rated']):  # we only get dummy 
#                                                                                                     # variables for categorical
#                                                                                                      # data
#        # split the variables with various entries: 
#        one_hot = df.copy().loc[:, feature].str.split(', ', expand=True).stack()
#        
#        one_hot = pd.get_dummies(one_hot, prefix=feature, drop_first=True).groupby(level=0).sum()
#        df_tmp = df_tmp.drop(feature, axis=1)
#        df_tmp = df_tmp.merge(one_hot, left_index=True, right_index=True)
#    
#    return df_tmp

In [None]:
#def check_unique_entries(df, sort=True):
#    # columns.remove('title')
#    for column in df.columns:
#        unique_entries = df[column].unique()
#        if sort:
#            unique_entries.sort()
#        print(column, len(unique_entries), '\n', unique_entries)

#df_train.runtime = df_train.runtime.apply(lambda x: x[:-4])
#df_train.BoxOffice = df_train.BoxOffice.apply(lambda x: x if x!=x else x[1:-3] + x[len(x)-2:])
#df_train.BoxOffice = df_train.BoxOffice.apply(lambda x: x if x!=x else x[:-1] + '0'*5 if x[-1]=='M' else x[:-1] + '0'*2)
#df_train.BoxOffice

#df_train.drop(['released', 'metascore', 'imdbRating', 'imdbVotes', 'tomatoMeter', 'tomatoRating'])

#df_to_int = df_train.loc[:, ['year', 'runtime', 'metascore', 'imdbVotes', 'tomatoMeter', 'tomatoReviews', 'tomatoFresh', 
#                          'tomatoRotten', 'tomatoUserMeter', 'tomatoUserReviews', 'BoxOffice']]

#def df_from_xml(data_set, columns):
#    df = pd.DataFrame(columns = columns)
#    for movie_name in os.listdir(data_set + '_Set/XML'):  
#        xtree = et.parse(data_set + '_Set/XML/' + movie_name)
#        xroot = xtree.getroot()
#        instance = []
#        for element in columns:
#            if xroot[0] is not None: # and element != 'goodforairplane': # feature "good for airplane" is not filled in 
#                instance.append(xroot.find('movie').get(element))
#            else:
#                pass
#        if len(instance) < len(columns):
#            instance.append(movie_name[:-3])
#        my_series = pd.Series(instance, index = columns)
#        df = df.append(my_series, ignore_index=True)
#    # labels_to_drop = ['director', 'writer', 'actors', 'plot', 'awards', 'poster', 'tomatoConsensus', 'Website', 'imdbID', 'type']
#    # df_train.replace('N/A', np.nan, inplace=True)
#    return df
#
## df_test = df_from_xml('Test')
## df = df_dev.append(df_test, ignore_index=True)
#df = df.loc[:, ['title', 'language', 'year', 'genre', 'country', 'runtime', 'rated']]
#df.runtime = df.runtime.apply(lambda x: x[:-4])
#df.rated.replace(['NOT RATED', 'UNRATED'], 'N/A', inplace=True)
#df.year = pd.to_numeric(df.year, errors='coerce')
#df.runtime = pd.to_numeric(df.runtime, errors='coerce')
#    
##df_dev = df.iloc[:95]
##df_test = df.iloc[95:]
#labels = pd.read_excel(r'Dev_Set\dev_set_groundtruth_and_trailers.xls', index_col=0).goodforairplane
## labels_test = pd.read_csv(r'Test_Set\CoeTestLabels.csv', index_col=0).goodforairplanes
#df.set_index('title', inplace=True)
## df_dev = df_dev.merge(labels_dev, left_index=True, right_index=True)
## df_test.set_index('title', inplace=True)
## df_test = df_test.merge(labels_test, left_index=True, right_index=True)
## df_test.shape