**Members of the group:**

Antoniello Antonia

Casale Teresa

Cerino Mario

Palladino Amedeo

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the exercise folder on the mounted Drive the working directory, so the
# relative file names used later (e.g. 'data.h5', 'feature.names') resolve.
import os
os.chdir('/content/drive/My Drive/Machine Learning/Exercises2019/Exercise')

We import all the libraries we need for our classifiers.

In [None]:
import os
import pickle
import re
import threading

import h5py
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from textblob import TextBlob

nltk.download('punkt');
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The following function takes a 1D vector as input and returns it reshaped as a 2D column vector (one row per element, a single column).

In [None]:
def to2D(x):
    """Return ``x`` reshaped as a column: one row per element, a single column."""
    column = np.asarray(x).reshape(-1, 1)
    return column

The following class is used to extract features in a format supported by machine learning algorithms from textual datasets.

In [None]:
class FeatureExtractorText:
    """Pipeline step that derives three length statistics from raw documents.

    For every document it computes the number of words, the number of
    characters contained in those words, and a "word density"
    (``chars / (words + 1)``).  The resulting ``(n_documents, 3)`` array is
    stored in the module-level ``container`` object; the documents themselves
    are passed through unchanged to the next pipeline step.
    """

    def fit(self, X, y=None):
        """Compute the statistics for ``X`` and publish them in ``container``.

        ``y`` is ignored; it is accepted (and now optional) only to match the
        estimator calling convention.  Returns ``self``.
        """
        features = self._feature_extraction(X)
        # The module-level `container` is the hand-off point read by Joiner.
        container._features = features
        return self

    def transform(self, X):
        """Refresh the stored statistics for ``X`` and return ``X`` unchanged.

        The statistics are recomputed on every call so that the container
        always describes the documents currently flowing through the pipeline.
        """
        self.fit(X, None)
        return X

    def _feature_extraction(self, X):
        """Return an ``(n_documents, 3)`` float array: words, chars, density.

        Each document may be a string (split on whitespace) or an already
        tokenized sequence of words.  Previously a string document was
        measured with ``len(row)`` and iterated character by character, so the
        "word count" and the "char count" were both the string length.
        """
        word_counts = []
        char_counts = []
        for doc in X:
            # str also covers numpy.str_, which is what iterating a numpy
            # array of strings yields.
            words = doc.split() if isinstance(doc, (str, bytes)) else doc
            word_counts.append(len(words))
            char_counts.append(sum(len(word) for word in words))

        doc_word_count = np.array(word_counts)
        doc_char_count = np.array(char_counts)
        # +1 keeps the ratio finite for empty documents.
        word_density = doc_char_count / (doc_word_count + 1)
        return np.column_stack((doc_word_count, doc_char_count, word_density))
                

The following class is used to join the vectorized data with the container features. In order to work, Feature Extractor requires a set of non-vectorized documents. After extracting the features, we save them in a container for later use in the pipeline; in particular, the joiner takes the features from the container and joins them to the features processed by Tfidf.

In [None]:
class Joiner:
    """Pipeline step that appends the container's features to the vectorized data.

    It reads the array currently stored in the module-level ``container``
    (``container._features``) and concatenates it column-wise to the matrix
    produced by the preceding vectorizer.
    """

    def fit(self, X, y=None):
        """Remember the dense form of ``X`` and the container's current features.

        ``X`` may be a sparse matrix (anything exposing ``toarray()``) or an
        array-like that is already dense.  ``y`` is ignored.  Returns ``self``.
        """
        # Densifying is required by np.concatenate below; note that it can be
        # memory-hungry for a large n-gram vocabulary.
        self.X_ = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
        self.features = container._features
        return self

    def transform(self, X):
        """Return ``X`` (densified) with the container's features appended as columns.

        The state is refreshed from ``X`` and the container on every call, so
        the result always pairs the current matrix with the current features.
        ``np.concatenate`` raises ``ValueError`` if the row counts differ.
        """
        self.fit(X, None)
        return np.concatenate((self.X_, self.features), axis=1)


The following class is used as intermediate between the extractor and the joiner objects in the pipeline structure.

In [None]:
class FeatureContainer:
    """Hand-off slot holding the features exchanged between pipeline steps.

    Access through ``get_features`` / ``set_features`` is serialized by one
    lock owned by the instance.  The previous version created a brand-new
    ``threading.Lock()`` inside every call, which cannot exclude anybody.
    """

    def __init__(self):
        self._features = None
        # One lock per container, shared by every get/set call.
        self._lock = threading.Lock()

    def __getstate__(self):
        """Return the picklable state; lock objects cannot be pickled."""
        state = self.__dict__.copy()
        state.pop('_lock', None)
        return state

    def __setstate__(self, state):
        """Restore the state and give the restored instance a fresh lock."""
        self.__dict__.update(state)
        self._lock = threading.Lock()

    def get_features(self):
        """Return the stored features (``None`` until something is stored)."""
        with self._lock:
            return self._features

    def set_features(self, features):
        """Replace the stored features with ``features``."""
        with self._lock:
            self._features = features
    
    

Preprocessing phase: we remove uninformative words (stop words) from the text and reduce each remaining word to its root (stem).

In [None]:
def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    The previous ``text.split(text)`` used the text as its own separator and
    therefore returned ``['', '']`` (or raised ``ValueError`` for an empty
    string) instead of the words.
    """
    return text.split()

In [None]:
# English stop-word list from NLTK.
stop = stopwords.words('english')
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with the module-level ``porter``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [None]:
# The Snowball stemmer gets its own name.  Rebinding `porter` here made
# tokenizer_porter use the Snowball stemmer as well, because that function
# looks `porter` up at call time.
snowball = SnowballStemmer('english')


def tokenizer_snowball(text):
    """Split ``text`` on whitespace and stem each token with the English Snowball stemmer."""
    return [snowball.stem(word) for word in text.split()]

Loading of the data set.

In [None]:
# Read the four datasets into memory with [:] so they remain usable after the
# file is closed; the context manager closes the file even if a read fails.
with h5py.File('data.h5', 'r') as f:
    X_training = f['X_training'][:]
    y_training = f['y_training'][:]
    X_test = f['X_test'][:]
    y_test = f['y_test'][:]

In [None]:
# Load the pickled object stored in 'feature.names'; retrieve_text indexes it
# by feature column (vector[i]) to turn non-zero columns back into words.
# NOTE(review): pickle.load can execute arbitrary code, so only open files that
# come from a trusted source.
with open('feature.names','rb') as f:
    vector = pickle.load(f)

Function used to convert the vectorized data back into text documents.

In [None]:
def retrieve_text(X_, vector):
    """Rebuild one text document per row of a vectorized data set.

    Parameters
    ----------
    X_ : iterable of rows, each row an iterable of numbers.
    vector : sequence of str where ``vector[i]`` is the word for column ``i``.

    Returns
    -------
    list of str
        For every row, the words whose column value is greater than zero, in
        column order, each followed by a single space (so a non-empty document
        ends with a space and a row without positive entries gives ``""``).
    """
    # str.join builds each document in one pass instead of repeated `+=`.
    return [
        "".join(vector[i] + " " for i, item in enumerate(doc) if item > 0)
        for doc in X_
    ]
          

We convert the vectorized training set and test set back into text documents.

In [None]:
# Turn both vectorized sets back into documents and keep them as numpy arrays.
X_training = np.array(retrieve_text(X_training, vector))
X_test = np.array(retrieve_text(X_test, vector))

Pipeline used for all the following classifiers.

The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.

## **Logistic Regression**

In [None]:
from sklearn.pipeline import Pipeline

# TF-IDF over unigrams and bigrams, tokenized and stemmed by tokenizer_snowball,
# with the NLTK English stop words removed.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        ngram_range=(1, 2),
                        stop_words=stop,
                        tokenizer=tokenizer_snowball)

# Values of the LogisticRegression parameter C explored by the grid search.
param_grid = [{'clf__C': [0.2, 0.5, 1, 5, 8, 10, 15, 20, 30, 40, 70, 100, 1000]}]

# The extractor stores its features in the module-level `container`,
# from which the joiner reads them back.
container = FeatureContainer()
extractor = FeatureExtractorText()
joiner = Joiner()

lr_tfidf = Pipeline([('extractor', extractor),
                     ('vect', tfidf),
                     ('joiner', joiner),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=10)
gs_lr_tfidf.fit(X_training, y_training)

# Save the fitted grid search; the context manager closes the file even if
# pickling fails.
with open('logistic.model', 'wb') as f:
    pickle.dump(gs_lr_tfidf, f)

print('Training set score =', gs_lr_tfidf.score(X_training, y_training))
print('Test set score =', gs_lr_tfidf.score(X_test, y_test))

Fitting 5 folds for each of 13 candidates, totalling 65 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   29.9s
[Parallel(n_jobs=10)]: Done  65 out of  65 | elapsed:   52.8s finished
  'stop_words.' % sorted(inconsistent))
  y = column_or_1d(y, warn=True)


Training set score = 0.9965004374453194
Test set score = 0.8765306122448979


## **Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE: despite the name `tfidf` (kept because it is a module-level name),
# this vectorizer produces raw term counts, not TF-IDF weights.
tfidf = CountVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        ngram_range=(1, 2),
                        tokenizer=tokenizer_snowball)

# Values of the MultinomialNB smoothing parameter alpha explored by the grid search.
# NOTE(review): alpha=0 disables smoothing; confirm that this candidate is intended.
param_grid = [{'clf__alpha': [0, 0.4, 0.7, 0.8, 1, 5]}]

# These components are re-created here but are not part of the pipeline below.
container = FeatureContainer()
extractor = FeatureExtractorText()
joiner = Joiner()

# Pipeline definition: term counts followed by the classifier.
lr_tfidf3 = Pipeline([('vect', tfidf),
                      ('clf', MultinomialNB())])

gs_lr_tfidf3 = GridSearchCV(lr_tfidf3, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=8)
multinomial_clf = gs_lr_tfidf3.fit(X_training, y_training)

# Save the fitted grid search; the context manager closes the file even if
# pickling fails.
with open('nbclassifier.model', 'wb') as f:
    pickle.dump(multinomial_clf, f)

print('Training set score =', multinomial_clf.score(X_training, y_training))
print('Test set score =', multinomial_clf.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:   18.1s finished
  y = column_or_1d(y, warn=True)


Training set score = 0.9881889763779528
Test set score = 0.8438775510204082


## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Un-normalized TF-IDF (norm=None) over unigrams and bigrams, tokenized and
# stemmed by tokenizer_snowball; no stop-word removal.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        ngram_range=(1, 2),
                        stop_words=None,
                        tokenizer=tokenizer_snowball,
                        norm=None,
                        use_idf=True)

# A single candidate: the grid search only cross-validates this configuration.
param_grid = [{'clf__max_depth': [5],
               'clf__max_leaf_nodes': [13]}]

# The extractor stores its features in the module-level `container`,
# from which the joiner reads them back.
container = FeatureContainer()
extractor = FeatureExtractorText()
joiner = Joiner()

# Pipeline definition.
lr_tfidf2 = Pipeline([('extractor', extractor),
                      ('vect', tfidf),
                      ('joiner', joiner),
                      ('clf', DecisionTreeClassifier(random_state=0))])
gs_lr_tfidf2 = GridSearchCV(lr_tfidf2, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=8)

# The extra score() call whose result was thrown away has been dropped; the
# test score is computed once, in the print below.
gs_lr_tfidf2.fit(X_training, y_training)
decision_tree = gs_lr_tfidf2

# Save the fitted grid search; the context manager closes the file even if
# pickling fails.
with open('decision_tree.model', 'wb') as f:
    pickle.dump(decision_tree, f)

print('Training set score=', decision_tree.score(X_training, y_training))
print('Test set score=', decision_tree.score(X_test, y_test))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:   19.8s remaining:   29.7s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:   20.5s finished


Training set score= 0.8613298337707787
Test set score= 0.863265306122449


## **Multiclassification combiner with weighted voting**

In [None]:
class Combiner:
    """Weighted-voting combiner for already-fitted binary (0/1) classifiers.

    Every classifier gets two weights, one per class: the fraction of samples
    of that class it predicts correctly on a reference set
    (``compute_reliability``).  Predictions are then combined by summing, per
    class, the weights of the classifiers voting for it (``compute_predictions``)
    or the class probabilities scaled by those weights
    (``compute_proba_predictions``).  Ties go to class 1.

    ``compute_reliability`` must be called before either prediction method,
    because it creates ``self.rel_dict``.
    """

    def __init__(self, classifiers,):
        self.classifiers_ = classifiers

    def compute_reliability(self, X, y):
        """Compute and store the per-class reliability of every classifier.

        Parameters
        ----------
        X : numpy array of samples, indexable with a boolean mask.
        y : numpy array of 0/1 labels; flattened with ``ravel()``.

        Returns
        -------
        dict
            ``{classifier: [reliability_0, reliability_1]}``, also stored as
            ``self.rel_dict``.

        Raises
        ------
        ZeroDivisionError
            If one of the two classes has no sample in ``y``.
        """
        labels = y.ravel()
        X_0 = X[labels == 0]
        X_1 = X[labels == 1]
        n_samples_0 = X_0.shape[0]
        n_samples_1 = X_1.shape[0]
        reliability_dict = {}

        for clf in self.classifiers_:
            # Fraction of true class-0 samples that the classifier labels 0 ...
            y_pred_0 = np.asarray(clf.predict(X_0.ravel()))
            reliability_0 = int(np.count_nonzero(y_pred_0 == 0)) / n_samples_0

            # ... and of true class-1 samples that it labels 1.
            y_pred_1 = np.asarray(clf.predict(X_1.ravel()))
            reliability_1 = int(np.count_nonzero(y_pred_1 == 1)) / n_samples_1

            reliability_dict[clf] = [reliability_0, reliability_1]

        self.rel_dict = reliability_dict
        return reliability_dict

    def compute_proba_predictions(self, X_test, Y_test):
        """Combine the classifiers' class probabilities, weighted by reliability.

        ``Y_test`` is not used; it is kept for interface compatibility.
        Returns a list with one 0/1 label per sample of ``X_test``.

        The probabilities are used as floats.  They used to be cast to int32,
        which truncated every value below 1.0 to 0, so both class scores were
        0 and class 1 was always returned.
        """
        probas = [np.asarray(clf.predict_proba(X_test)) for clf in self.classifiers_]
        weights = [self.rel_dict[clf] for clf in self.classifiers_]

        y = []
        # zip(*probas) yields, for each sample, one probability row per classifier.
        for sample in zip(*probas):
            score_0 = 0
            score_1 = 0
            for clf_proba, clf_weights in zip(sample, weights):
                # clf_proba[0] is the current classifier's probability of class 0.
                score_0 += clf_proba[0] * clf_weights[0]
                score_1 += clf_proba[1] * clf_weights[1]
            y.append(0 if score_0 > score_1 else 1)
        return y

    def compute_predictions(self, X_test, Y_test):
        """Combine the classifiers' hard predictions by reliability-weighted vote.

        ``Y_test`` is not used; it is kept for interface compatibility.
        Returns a list with one 0/1 label per sample of ``X_test``.
        """
        # After the transpose: one row per sample, one column per classifier.
        votes = np.array(
            [clf.predict(X_test).astype(np.int32) for clf in self.classifiers_]
        ).T
        rel_dict = self.rel_dict

        y = []
        for sample_votes in votes:
            vec = {0: 0, 1: 0}
            for index, label in enumerate(sample_votes):
                # A vote counts as much as the voter's reliability on that class.
                vec[label] += rel_dict[self.classifiers_[index]][label]
            y.append(0 if vec[0] > vec[1] else 1)
        return y

    def score(self, y_comb, y_test):
        """Return ``1 - mean(|y_test - y_comb|)``.

        For 0/1 labels this is the fraction of matching predictions.
        ``y_test`` must be a numpy array; ``y_comb`` may be any sequence.
        """
        y_comb = np.array(y_comb)
        return 1 - (np.abs(y_test.ravel() - y_comb).sum() / y_test.shape[0])
          
        
      
    

In [None]:
# Split the training data 70% / 30% (shuffled, fixed seed).  Only the 70%
# part is used below, to estimate the reliability of each classifier.
# NOTE(review): the classifiers were fitted on all of X_training, so these
# reliabilities are measured on data they have already seen.
X_training_1, X_test_1, y_training_1, y_test_1 = train_test_split(X_training, y_training, test_size=0.3, shuffle=True, random_state=11)

# Classifiers taking part in the vote.
estimators = [gs_lr_tfidf, decision_tree, multinomial_clf]

combiner = Combiner(estimators)

# Per-class reliability vectors of every classifier.
combiner.compute_reliability(X_training_1, y_training_1)

# Combined predictions; note that X_comb holds the predicted labels for the
# training set (the name is kept because it is a module-level name).
X_comb = combiner.compute_predictions(X_training, y_training)
y_comb = combiner.compute_predictions(X_test, y_test)

# Score each set once and reuse the values for saving and printing.
train_score = combiner.score(X_comb, y_training)
test_score = combiner.score(y_comb, y_test)

# Save the two combiner scores (training, test); the combiner object itself
# is not what gets written.  The context manager closes the file even if
# pickling fails.
comb = np.array([train_score, test_score])
with open('combiner.model', 'wb') as f:
    pickle.dump(comb, f)

# Final performance evaluation.
print('Multiclassification Combiner score on training set =', train_score)
print('Multiclassification Combiner score on test set =', test_score)

Multiclassification Combiner score on training set = 0.994750656167979
Multiclassification Combiner score on test set = 0.9
