In [6]:
from os import chdir
chdir(r'C:\Users\Pia\OneDrive\Data Science\experiment_design\ex_2\CoE_dataset')
import xml.etree.ElementTree as et

from random import seed, sample, choice
import numpy as np
import pandas as pd
pd.options.display.max_columns = 34
pd.options.display.max_rows = 250

from metadata_ratings_helper_functions import load_data, get_dummies

from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier

## Use this chunk of code for loading the training data
We'll do one-hot encoding after feature selection

In [7]:
df = load_data('Dev').loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']]
df_test = load_data('Test').loc[:, ['language', 'year', 'genre', 'country', 'runtime', 'rated']]

labels = pd.read_excel(r'Dev_Set\dev_set_groundtruth_and_trailers.xls', index_col=0).goodforairplane

## Use this for computing the metrics:

In [8]:
def get_clf(classifier):
        
    if classifier == 'knn':
        return KNeighborsClassifier()
        
    elif classifier == 'nearest_mean':
        return NearestCentroid()
        
    elif classifier == 'decision_tree':
        return DecisionTreeClassifier()
        
    elif classifier == 'logistic_regression':
        return LogisticRegression(solver='lbfgs')  # for not getting warnings
        
    elif classifier == 'svm':
        return SVC(kernel = 'rbf', gamma='auto')  # for avoiding warnings    
            
    elif classifier == 'bagging':
        return BaggingClassifier()
        
    elif classifier == 'random_forest':
        return RandomForestClassifier(n_estimators=10)  # for not getting warnings
        
    elif classifier == 'adaboost':
        return AdaBoostClassifier()
        
    elif classifier == 'gradient_boost':
        return GradientBoostingClassifier()
    

def compute_scores(X, y, classifier):    
    
    np.random.seed(1)  # scikit-learn uses numpy.random
    
    return cross_validate(get_clf(classifier), X, y, cv=10, scoring=['precision', 'recall', 'f1'])


def compute_preds(X, y, X_test, classifier, S):
    
    X_best = X.copy().loc[:, S]
    X_best_test = X_test.copy().loc[:, S]
    
    X_dev_test = X_best.append(X_best_test)
    X_dummies = get_dummies(X_dev_test, S)
    
    X_dev = X_dummies.iloc[:95]
    X_test = X_dummies.iloc[95:]
        
    np.random.seed(1)
    preds_cv = cross_val_predict(get_clf(classifier), X_dev, y, cv=10)
    
    clf = get_clf(classifier)
    clf.fit(X_dev, y)   
    
    X_test.fillna(X_best_test.mean(), inplace=True)
    preds = clf.predict(X_test)

    return preds_cv, preds


def LVW(X, y, X_test, K, classifier):
     
    original_features = list(X.columns)    
    f1_best = 0
    k = 0
    C = len(original_features)
        
    seed(1)  # seed for package 'random'
    while k < K:
        sample_sizes = range(1, C + 1)
        my_choice = choice(sample_sizes)
        S1 = sample(original_features, my_choice)
        C1 = len(S1)
        
        # take a subset of the original dataframe X:
        X1 = X.copy().loc[:, S1]
        
        X1 = get_dummies(X1, S1)
        
        scores = compute_scores(X1, y, classifier)
        f1 = np.mean(scores['test_f1'])
        
        if (f1 > f1_best) or (f1 == f1_best and C1 < C):
            k, f1_best, C, S  = 0, f1, C1, S1
            precision, recall = np.mean(scores['test_precision']), np.mean(scores['test_recall'])
        else:
            k += 1
        
    # get a string: 
    used_features = S[0]
    for feature in S[1:]:
        used_features += ', ' + feature
    

    predictions_cv, predictions = compute_preds(X, y, X_test, classifier, S)

    return pd.DataFrame({'classifier': [classifier], 'used_features': [used_features], 'precision': [precision], 
                         'recall': [recall], 'f1': f1_best}, index=['classifier']), predictions_cv, predictions

In [9]:
# as the decision tree does not need the LVW, we compute it first without the LVW: 
df_with_dummies = get_dummies(df)
scores_tree = compute_scores(df_with_dummies, labels, 'decision_tree')

scores = pd.DataFrame({'classifier': ['decision_tree'], 'used_features': ['all'], 
                       'precision': [np.mean(scores_tree['test_precision'])], 
                       'recall': [np.mean(scores_tree['test_recall'])], 'f1': np.mean(scores_tree['test_f1'])},
                        index=['classifier'])

preds_tree, preds_tree_test = compute_preds(df, labels, df_test, 'decision_tree', df.columns)

preds_dev = pd.DataFrame(preds_tree, index=labels.index, columns=['decision_tree'])
preds_test = pd.DataFrame(preds_tree_test, index=df_test.index, columns=['decision_tree'])



for clf in ['knn', 'nearest_mean', 'logistic_regression', 'svm', 'bagging', 'random_forest', 'adaboost', 'gradient_boost']: 
    scores_tmp, preds_cv_tmp, preds_tmp = LVW(df, labels, df_test, 10, clf)
    scores = scores.append(scores_tmp)
    preds_dev = preds_dev.merge(pd.DataFrame(preds_cv_tmp, index=labels.index, columns=[clf]), 
                        left_index=True, right_index=True)
    preds_test = preds_test.merge(pd.DataFrame(preds_tmp, index=df_test.index, columns=[clf]), 
                        left_index=True, right_index=True)
 
# scores.to_csv('table_2_metadata.csv')
# preds_dev.to_csv('predictions_metadata_dev.csv')
preds_test.to_csv('predictions_metadata_test.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
