In [1]:
import sys
# Suppress all warnings for the rest of the notebook session
import warnings
warnings.filterwarnings('ignore')

# Add the parent directory to the module search path at index 1, leaving index 0
# untouched; presumably the project packages imported in later cells live there.
sys.path.insert(1, "../")

## 0. Loading dataset, modules and defining constants

In [2]:
# Load the dataset once; every evaluation cell in this notebook reuses `dataset`.
import dataset.dataset_reader as dr
dataset = dr.load_dataset()

In [3]:
# Prediction targets passed to mm.evaluate(...); the same strings appear as the
# row labels of the result tables printed by sm.show_result(...).
BOTH = "both"
AGE_ONLY = "age"
GENDER_ONLY = "gender"

In [4]:
# File names handed to mm.save_map(...) / MetricsMap(path=...), one per experiment.
SVM_LINEAR = 'svm_linear.pkl'
SVM_LINEAR_FEATURES = 'svm_linear_features.pkl'
SVM_LINEAR_SCALED_PCA = 'svm_linear_scl_pca.pkl'
SVM_LINEAR_FEATURES_CHI2 = 'svm_linear_feat_chi2.pkl'
SVM_LINEAR_N_GRAMS = 'svm_linear_n_grams.pkl'
STACKING = 'stacking.pkl'
WORD_2_VEC = 'word2vec.pkl'
WORD_2_VEC_GOOGLE = 'word2vec_google.pkl'
WORD_2_VEC_SVM = 'word2vec_svm.pkl'
LOGISTIC_REGRESSION = 'logistic_regression.pkl'
LOGISTIC_REGRESSION_FEATURES = 'logistic_regression_feature.pkl'
GLOVE_FEATURES = 'glove_features.pkl'
# Original (misspelled) name, kept as an alias so cells that still use it keep working.
GLOVE_FEAUTES = GLOVE_FEATURES

In [5]:
from validation.nested_k_fold import NestedKFoldValidation
from validation.k_fold import KFoldValidation
from metrics.standard_metrics import StandardMetrics
from metrics.metrics_map import MetricsMap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, SelectKBest
from metrics.t_test_metrics import TTestMetrics

## 1. SVM evaluation

### 1.1. SVM with linear kernel

In [6]:
from systems.simple_system import SimpleEvaluation


In [20]:
# Section 1.1 setup: the evaluation system, a nested k-fold validator (seed 42)
# searching the system's default SVM parameter grid, the metrics presenter and a
# fresh MetricsMap that the evaluate cells below fill in.
se = SimpleEvaluation() # Linear svm is default classifier
nkf = NestedKFoldValidation(param_grid=se.default_svm_get_param_grid(), random_state=42)
sm = StandardMetrics()
mm = MetricsMap()


In [21]:
# Evaluate the linear-SVM system on the BOTH target with the nested validator.
# NOTE(review): the recorded output below shows this run was interrupted (KeyboardInterrupt).
mm.evaluate(dataset, se, nkf, BOTH)

0%  100%
[##   ] | ETA: 00:01:28

KeyboardInterrupt: 

In [28]:
# Evaluate the linear-SVM system on the AGE_ONLY target with the nested validator.
mm.evaluate(dataset, se, nkf, AGE_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:02:23


In [29]:
# Evaluate the linear-SVM system on the GENDER_ONLY target with the nested validator.
mm.evaluate(dataset, se, nkf, GENDER_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:45


In [30]:
# Save the collected results under SVM_LINEAR (loaded again by the t-test cell
# later in the notebook) and print the per-target metrics table.
mm.save_map(SVM_LINEAR)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.71834,0.718447,0.71834,0.718303
both,0.326105,0.231609,0.207512,0.199005
age,0.486229,0.242697,0.262694,0.235315


### 1.2. SVM with linear kernel and additional features

In [31]:
# Section 1.2 setup: fresh system, presenter and results map, plus the default
# additional feature set. `nkf` is not redefined here; the evaluate cells below
# reuse the validator created in section 1.1.
se = SimpleEvaluation() # Linear svm is default classifier
sm = StandardMetrics()
mm = MetricsMap()
features = se.default_feature_set()

In [32]:
# Evaluate on the BOTH target, passing the default additional feature set.
mm.evaluate(dataset, se, nkf, BOTH, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:04:43


In [33]:
# Evaluate on the AGE_ONLY target, passing the default additional feature set.
mm.evaluate(dataset, se, nkf, AGE_ONLY, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:03:08


In [34]:
# Evaluate on the GENDER_ONLY target, passing the default additional feature set.
mm.evaluate(dataset, se, nkf, GENDER_ONLY, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:55


In [35]:
# Save the collected results under SVM_LINEAR_FEATURES and print the metrics table.
mm.save_map(SVM_LINEAR_FEATURES)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.720507,0.721251,0.720507,0.720144
both,0.33256,0.269363,0.216314,0.214065
age,0.4977,0.273523,0.276205,0.257404


### 1.3. SVM with StandardScaler and PCA

In [19]:
# Section 1.3 setup: fresh system, presenter and results map, plus the scaler and
# a 2-component PCA that are passed to mm.evaluate(...) as `scl` and `pca`.
# `nkf` is not redefined here; the validator from an earlier cell is reused.
se = SimpleEvaluation() # Linear svm is default classifier
sm = StandardMetrics()
mm = MetricsMap()
std_scaler = StandardScaler()
pca = PCA(n_components=2)

In [20]:
# Evaluate on the BOTH target with the StandardScaler and PCA instances supplied.
mm.evaluate(dataset, se, nkf, BOTH, scl=std_scaler, pca=pca)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:15:29


In [22]:
# Evaluate on the AGE_ONLY target with the StandardScaler and PCA instances supplied.
mm.evaluate(dataset, se, nkf, AGE_ONLY, scl=std_scaler, pca=pca)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:15:01


In [23]:
# Evaluate on the GENDER_ONLY target with the StandardScaler and PCA instances supplied.
mm.evaluate(dataset, se, nkf, GENDER_ONLY, scl=std_scaler, pca=pca)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:13:34


In [24]:
# Save the collected results under SVM_LINEAR_SCALED_PCA and print the metrics table.
mm.save_map(SVM_LINEAR_SCALED_PCA)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.516279,0.478848,0.516279,0.403737
both,0.199552,0.040204,0.105752,0.05055
age,0.410539,0.083444,0.196697,0.11717


### 1.4. SVM with linear kernel, additional features and chi2 reduction

In [16]:
from evaluation.eval_utils import get_documents_y
from sklearn.feature_extraction.text import TfidfVectorizer

def space_splitter(sentence):
    """Tokenize ``sentence`` by splitting it on single space characters.

    The split uses the literal " " separator, so consecutive spaces produce
    empty-string tokens and other whitespace (tabs, newlines) is not split on.
    """
    separator = " "
    tokens = sentence.split(separator)
    return tokens

# Explore how large the TF-IDF vocabulary is for the BOTH target.
f, y = get_documents_y(dataset, BOTH)
vectorizer = TfidfVectorizer(tokenizer=space_splitter)
vectorizer.fit(f)

f = vectorizer.transform(f)
# Read the shape straight from the transformed matrix: converting it to a dense
# array first would allocate every (mostly zero) cell just to print two numbers.
print(f.shape)
# Powers of two from 2 to 16384; the same list is used as the feature-count grid
# in the next cell.
print([2 ** exponent for exponent in range(1, 15)])

(436, 252169)
[2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]


In [17]:
# Section 1.4 setup: fresh system, presenter and results map.
se = SimpleEvaluation() # Linear svm is default classifier
sm = StandardMetrics()
mm = MetricsMap()

# Grid handed to the nested validation: number of chi2-selected features
# (powers of two, 2..16384) and the SVM constant C (2^-5..2^4).
N_FEATURES_OPTIONS = [2 ** exponent for exponent in range(1, 15)]
C_OPTIONS = [2 ** exponent for exponent in range(-5, 5)]
param_grid = {
        # The step is named 'pca', but the grid puts a chi2 SelectKBest in that slot
        # (presumably replacing the PCA() placeholder passed to mm.evaluate -- TODO confirm).
        'pca': [SelectKBest(chi2)],
        'pca__k': N_FEATURES_OPTIONS,
        'svc__C': C_OPTIONS
    }

# Same seed as every other validator in this notebook, so reruns are comparable.
nkf = NestedKFoldValidation(param_grid=param_grid, random_state=42)
features = SimpleEvaluation.default_feature_set()

In [18]:
# Evaluate on the BOTH target with additional features; a PCA() instance is
# passed as `pca` while the validator's grid lists SelectKBest(chi2) for that step.
mm.evaluate(dataset, se, nkf, BOTH, additional_features=features, pca=PCA())

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 01:33:47


In [60]:
# Evaluate on the AGE_ONLY target with additional features and the chi2 grid.
# NOTE(review): the results table printed for this section has no 'age' row -- confirm this run was kept.
mm.evaluate(dataset, se, nkf, AGE_ONLY, additional_features=features, pca=PCA())

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:18:49


In [20]:
# Evaluate on the GENDER_ONLY target with additional features and the chi2 grid.
mm.evaluate(dataset, se, nkf, GENDER_ONLY, additional_features=features, pca=PCA())

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 02:22:19


In [21]:
# Save the collected results under SVM_LINEAR_FEATURES_CHI2 and print the metrics table.
mm.save_map(SVM_LINEAR_FEATURES_CHI2)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
both,0.330648,0.252258,0.217244,0.215168
gender,0.697569,0.698353,0.697569,0.69724


### 1.5. SVM with linear kernel and n-grams


In [6]:
from systems.simple_system import SimpleEvaluation


In [14]:
# Section 1.5 setup: system built with n_gram_range=(1,2), a seeded nested
# validator over the default SVM grid, presenter and a fresh results map.
se = SimpleEvaluation(n_gram_range=(1,2)) # Linear svm is default classifier
nkf = NestedKFoldValidation(param_grid=se.default_svm_get_param_grid(), random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

# Default additional feature set; in this section only the GENDER_ONLY cell passes it.
additional_features = SimpleEvaluation.default_feature_set()


In [8]:
# Evaluate the n-gram system on the BOTH target (no additional features).
mm.evaluate(dataset, se, nkf, BOTH)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:29:08


In [28]:
# Evaluate the n-gram system on the AGE_ONLY target (no additional features).
# NOTE(review): the results table printed for this section has no 'age' row, so the
# recorded output here looks stale -- rerun before relying on it.
mm.evaluate(dataset, se, nkf, AGE_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:02:23


In [15]:
# Evaluate the n-gram system on the GENDER_ONLY target.
# NOTE(review): unlike the BOTH and AGE_ONLY cells of this section, this call passes
# additional_features -- confirm whether that difference is intended.
mm.evaluate(dataset, se, nkf, GENDER_ONLY, additional_features=additional_features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:05:13


In [12]:
# Save the collected results under SVM_LINEAR_N_GRAMS and print the metrics table.
mm.save_map(SVM_LINEAR_N_GRAMS)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.722833,0.723108,0.722833,0.722733
both,0.325735,0.234006,0.206868,0.199055


## 2. Stacking


### 2.1. Stacking default
* Gender: (Bayes logistic regression, Naive Bayes Multinomial, Naive Bayes, Linear SVM) -> meta: Naive Bayes

* Age: (Naive Bayes Multinomial, Simple logistic, Naive Bayes, Linear SVM) -> meta: Linear SVM

In [6]:
from systems.stacking_system import StackingEvaluation
from reduction.inform_gain_old import InformationGainOld

In [7]:
# Section 2.1 setup: stacking system, plain k-fold validator (seed 42),
# presenter and a fresh results map.
sc = StackingEvaluation()
kf = KFoldValidation(random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

# The system's default information-gain reduction, passed to evaluate as `reduction`.
igo = sc.default_information_gain_reduce()

In [8]:
# Evaluate the stacking system on the BOTH target with the information-gain reduction.
mm.evaluate(dataset, sc, kf, BOTH, reduction=igo)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:12:44


In [17]:
# Evaluate the stacking system on the AGE_ONLY target.
# NOTE(review): unlike the BOTH cell, no `reduction=igo` is passed here -- confirm intent.
mm.evaluate(dataset, sc, kf, AGE_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:08


In [8]:
# Evaluate the stacking system on the GENDER_ONLY target.
# NOTE(review): unlike the BOTH cell, no `reduction=igo` is passed here -- confirm intent.
mm.evaluate(dataset, sc, kf, GENDER_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:10:36


In [9]:
# Save the collected results under STACKING and print the metrics table.
# NOTE(review): the recorded table contains only the 'gender' row.
mm.save_map(STACKING)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.725159,0.728124,0.725159,0.724306


## 3. Word2Vec

### 3.1. Word2Vec simple
* with words from dataset

In [6]:
from systems.word2vec_system import Word2VecEvaluation

In [7]:
# Section 3.1 setup: word2vec system with its defaults, plain k-fold validator
# (seed 42), presenter and a fresh results map.
w2v = Word2VecEvaluation()
kf = KFoldValidation(random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

In [8]:
# Evaluate the word2vec system on the BOTH target with k-fold validation.
mm.evaluate(dataset, w2v, kf, BOTH)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:01


In [9]:
# Evaluate the word2vec system on the AGE_ONLY target with k-fold validation.
mm.evaluate(dataset, w2v, kf, AGE_ONLY)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:01


In [10]:
# Evaluate the word2vec system on the GENDER_ONLY target with k-fold validation.
mm.evaluate(dataset, w2v, kf, GENDER_ONLY)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:01


In [11]:
# Save the collected results under WORD_2_VEC and print the metrics table.
mm.save_map(WORD_2_VEC)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.674841,0.675605,0.674841,0.674463
both,0.259843,0.1652,0.166609,0.154781
age,0.401477,0.206138,0.216316,0.198811


### 3.2. Word2Vec with Google's pre-trained matrix


In [10]:
# Placeholder value: replace the string with the location of Google's pre-trained
# word2vec matrix (the literal below is a "put the location here" reminder).
google_matrix = 'samo_treba_lokaciju_staviti_umjesto_ovoga_i_to_je_to' # see http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
pre_trained = ('google', google_matrix) # <-- The name (first tuple member) can be anything; it is only used to save the trained model at the end

In [None]:
# Section 3.2 setup: word2vec system, k-fold validator (seed 42), presenter, results map.
# NOTE(review): `pre_trained` from the previous cell is never passed to
# Word2VecEvaluation(), so as written this repeats the section 3.1 configuration --
# confirm the constructor argument that should receive it.
w2v = Word2VecEvaluation()
kf = KFoldValidation(random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

In [None]:
# Evaluate on the BOTH target (cell has not been executed: no recorded output).
mm.evaluate(dataset, w2v, kf, BOTH)

In [None]:
# Evaluate on the AGE_ONLY target (cell has not been executed: no recorded output).
mm.evaluate(dataset, w2v, kf, AGE_ONLY)

In [None]:
# Evaluate on the GENDER_ONLY target (cell has not been executed: no recorded output).
mm.evaluate(dataset, w2v, kf, GENDER_ONLY)

In [None]:
# Save the collected results under WORD_2_VEC_GOOGLE and print the metrics table.
mm.save_map(WORD_2_VEC_GOOGLE)
sm.show_result(mm)

### 3.3. Word2Vec with SVM

In [19]:
from sklearn.svm import LinearSVC

In [20]:
# Section 3.3 setup: word2vec system with a LinearSVC supplied as the classifier
# under the name 'svc', k-fold validator (seed 42), presenter and a fresh results map.
w2v = Word2VecEvaluation(clf=('svc', LinearSVC()))
kf = KFoldValidation(random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

In [21]:
# Evaluate the word2vec + LinearSVC system on the BOTH target.
mm.evaluate(dataset, w2v, kf, BOTH)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:03


In [22]:
# Evaluate the word2vec + LinearSVC system on the AGE_ONLY target.
mm.evaluate(dataset, w2v, kf, AGE_ONLY)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:01


In [23]:
# Evaluate the word2vec + LinearSVC system on the GENDER_ONLY target.
mm.evaluate(dataset, w2v, kf, GENDER_ONLY)

Ucitao model
w2v


0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:00


In [24]:
# Save the collected results under WORD_2_VEC_SVM and print the metrics table.
mm.save_map(WORD_2_VEC_SVM)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.665539,0.669232,0.665539,0.664379
both,0.275278,0.155367,0.163509,0.145437
age,0.444688,0.234982,0.239668,0.219333


## 4. Logistic regression

### 4.1. Logistic regression with parameters:

* C = 100.00
* regularization = L2

In [30]:
from systems.simple_system import SimpleEvaluation
from sklearn.linear_model import LogisticRegression
from validation.k_fold import KFoldValidation

In [31]:
# Section 4.1 setup: L2-penalised logistic regression with C=100, built once and
# handed to the evaluation system under the step name 'clf'.
clf = LogisticRegression(penalty='l2', random_state=42, C=100.0)
se = SimpleEvaluation(clf=('clf', clf))

# Plain k-fold validation (seed 42), presenter and a fresh results map.
kf = KFoldValidation(random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

In [32]:
# Evaluate the logistic-regression system on the BOTH target with k-fold validation.
mm.evaluate(dataset, se, kf, BOTH)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:20


In [33]:
# Evaluate the logistic-regression system on the AGE_ONLY target with k-fold validation.
mm.evaluate(dataset, se, kf, AGE_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:10


In [34]:
# Evaluate the logistic-regression system on the GENDER_ONLY target with k-fold validation.
mm.evaluate(dataset, se, kf, GENDER_ONLY)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:03


In [35]:
# Save the collected results under LOGISTIC_REGRESSION and print the metrics table.
mm.save_map(LOGISTIC_REGRESSION)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.713848,0.713923,0.713848,0.713827
both,0.324041,0.239563,0.210068,0.204889
age,0.479411,0.254601,0.263618,0.243766


### 4.2. Logistic regression with:
* additional features
* param grid

In [40]:
from systems.simple_system import SimpleEvaluation
from sklearn.linear_model import LogisticRegression
from validation.nested_k_fold import NestedKFoldValidation

In [43]:
# Section 4.2 setup: L2-penalised logistic regression, built once and handed to
# the evaluation system under the step name 'clf'.
clf = LogisticRegression(penalty='l2', random_state=42)
# Values of C offered to the nested validation ('clf__C' targets the 'clf' step).
param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
se = SimpleEvaluation(clf=('clf', clf))

# NOTE: despite its name, `kf` is a NestedKFoldValidation here; the name is kept
# because the evaluate cells of this section reference it.
kf = NestedKFoldValidation(param_grid=param_grid, random_state=42)
sm = StandardMetrics()
mm = MetricsMap()

features = SimpleEvaluation.default_feature_set()

In [44]:
# Evaluate on the BOTH target with additional features (here `kf` is the nested validator).
mm.evaluate(dataset, se, kf, BOTH, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:03:46


In [45]:
# Evaluate on the AGE_ONLY target with additional features (here `kf` is the nested validator).
mm.evaluate(dataset, se, kf, AGE_ONLY, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:01:45


In [48]:
# Evaluate on the GENDER_ONLY target with additional features (here `kf` is the nested validator).
mm.evaluate(dataset, se, kf, GENDER_ONLY, additional_features=features)

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:30


In [35]:
# Save this section's results under their own file name. Saving to
# LOGISTIC_REGRESSION here would overwrite the results stored by section 4.1.
mm.save_map(LOGISTIC_REGRESSION_FEATURES)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
gender,0.713848,0.713923,0.713848,0.713827
both,0.324041,0.239563,0.210068,0.204889
age,0.479411,0.254601,0.263618,0.243766


In [12]:
# Load two previously saved result maps for a pairwise comparison.
# NOTE(review): `mm_lr` is loaded from SVM_LINEAR_FEATURES, not from a logistic
# regression file -- either the variable name or the path is misleading; confirm
# which comparison is intended.
mm_svm = MetricsMap(path=SVM_LINEAR)
mm_lr = MetricsMap(path=SVM_LINEAR_FEATURES)

In [13]:
# Show the t-test statistic and p-value per target for the two loaded result maps.
tt = TTestMetrics()

tt.show_result(metrics_map=(mm_svm, mm_lr))

Unnamed: 0,Statistics,p-value
age,-1.390635,0.236707
both,-1.720918,0.160378
gender,-0.191721,0.8573


## 5. GloVe with additional features

In [6]:
from features.glove_features import GloveFeatures
from systems.simple_system import SimpleEvaluation

# GloVe feature provider, passed to mm.evaluate(...) as `glove` in this section.
glove = GloveFeatures()

In [7]:
# Section 5 setup: default system, presenter, seeded nested validator over the
# default SVM grid, a fresh results map and the default additional feature set.
se = SimpleEvaluation() # Linear svm is default classifier
sm = StandardMetrics()
nkf = NestedKFoldValidation(param_grid=se.default_svm_get_param_grid(), random_state=42)
mm = MetricsMap()
features = se.default_feature_set()

In [8]:
# Evaluate on the BOTH target with additional features plus GloVe features.
# NOTE(review): the recorded output is a ValueError about mismatched row counts
# (421 vs 436) when the feature blocks are combined -- the GloVe block appears to
# have fewer rows than the other block; investigate before rerunning.
mm.evaluate(dataset, se, nkf, BOTH, additional_features=features, glove=glove)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 421, expected 436.

In [16]:
# Evaluate on the AGE_ONLY target with additional features plus GloVe features.
# NOTE(review): the recorded output is a NameError for `features`, i.e. this cell
# was run in a kernel where the setup cell above had not been executed.
mm.evaluate(dataset, se, nkf, AGE_ONLY, additional_features=features, glove=glove)

NameError: name 'features' is not defined

In [None]:
# Evaluate on the GENDER_ONLY target with additional features plus GloVe features
# (cell has not been executed: no recorded output).
mm.evaluate(dataset, se, nkf, GENDER_ONLY, additional_features=features, glove=glove)

In [9]:
# Save the collected results under the GloVe file name and print the metrics table.
# NOTE(review): the recorded table contains only the 'both' row.
mm.save_map(GLOVE_FEAUTES)
sm.show_result(mm)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
both,0.279828,0.213042,0.189558,0.18464
