# Chalearn Data Analysis

In [2]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix
from extract_feat_stage4 import createXY
import numpy as np

## Import Training Data
* Features were extracted using a 3D ResNeXt-101 model pretrained on the Kinetics dataset
* Features are provided as 2048-dimensional vectors, one for each 16-frame segment of a video. The segment vectors of a video were concatenated and their mean was kept, producing a single 2048-dimensional vector for the entire video
* Classification is performed for the 'extraversion' label since it has the most positive data samples after binarizing at values > 0.5: 55% of the data samples are positive examples (the remaining labels have at best 43% positive examples) across the entire dataset of 6000 videos.
* Keeping the student dataset in mind, the dataset was reduced to 960 videos. After reducing the dataset, the percentage of positive examples was 53.5%

In [3]:
# Load the features (X) and labels (Y) through the project helper.
print('loading...')
X, Y = createXY()

# Report the number of array dimensions of each result.
print('X-dim = ', X.ndim)
print('Y-dim = ', Y.ndim)

# Report the number of entries in X and in Y, one count per line.
print(len(X), len(Y), sep='\n')

loading...
X-dim =  2
Y-dim =  1
960
960


## Training using SVM
* Using K-fold cross-validation with K = 5 to get the accuracy and the normalized confusion matrix

In [8]:
# 5-fold cross-validation of an SVC with default hyper-parameters.
# KFold is used without shuffling, so the folds are consecutive blocks of
# samples and the split is deterministic across runs.
k_fold = KFold(n_splits=5)
svc = svm.SVC()

# Score each fold with plain accuracy so the reported number matches its
# label. The previous scoring='precision_macro' produced macro-averaged
# precision, which was then stored and printed as "accuracy".
fold_scores = cross_val_score(svc, X, Y, cv=k_fold, scoring='accuracy')
accuracy = np.mean(fold_scores)
print("accuracy = ", round(accuracy, 2))

# Out-of-fold prediction for every sample (each sample is predicted by the
# model trained on the folds that do not contain it); consumed by the
# classification report and confusion matrix cells.
y_pred = cross_val_predict(svc, X, Y, cv=k_fold)


accuracy =  0.64


## Classification report

In [10]:
# Per-class precision, recall and F1 of the cross-validated predictions.
from sklearn.metrics import classification_report

report_text = classification_report(Y, y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       0.61      0.59      0.60       446
          1       0.66      0.67      0.66       514

avg / total       0.64      0.64      0.64       960



## Confusion matrix

In [29]:
# Raw confusion counts: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(Y, y_pred)

# Normalise by the total number of samples, so the four cells sum to 1
# (fractions of the whole dataset, not per-class rates).
conf_mat_norm = np.true_divide(conf_mat, len(Y))
print(conf_mat_norm)



[[ 0.27604167  0.18854167]
 [ 0.175       0.36041667]]
