This Notebook is divided into 3 parts:
    1. n-grams and tf-idf language model implementation
    2. Random Forest model (baseline, tuning, best model evaluation)
    3. FastText embeddings

In [45]:
import pandas as pd
import numpy as np

# libraries for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Random Forest libraries
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

# fasttext libaries 
from gensim.models.fasttext import FastText
from gensim.models import FastText
from gensim.test.utils import common_texts
from gensim.test.utils import datapath
fb_path = datapath("wiki-news-300d-1M-subword.bin/wiki-news-300d-1M-subword.bin") 
from gensim.models.fasttext import load_facebook_vectors
from gensim.models.fasttext import load_facebook_model
import numpy as np

## Data set up

The project will use unigrams and bigrams from the heavily pre-processed data (lower case, removing stopwords, lemmanisation) and mildly pre-processed data (lower case, removing limited stopwords)

The classification will try to model the solution for 2 classes:
- positive ('Outstanding' and 'Good')
- negative ('Requires improvement' and 'Inadequate')



In [7]:
X_train = pd.read_csv('x_train.csv', index_col = 0)
Y_train = pd.read_csv('y_train.csv', index_col = 0)
X_test = pd.read_csv('x_val.csv', index_col = 0).sort_index()
Y_test = pd.read_csv('y_val.csv', index_col = 0).sort_index()

In [7]:
# df_test_idx = pd.read_csv('df_test_indices.csv', index_col = 0)

## ngram + tfidf
detecting best n features from the text

In [9]:
X_test[:1]

Unnamed: 0,full_text_limited_preprocess
2,finding part hospital community hospital provi...


In [10]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (2, 4)

# Limit on the number of features. We use the top 300 features.
TOP_K = 300

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams + 3-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens and list.
            'min_df': MIN_DOCUMENT_FREQUENCY,
            'max_features' : TOP_K
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)
    scores = vectorizer.get_feature_names()
    

#     scores = pd.DataFrame(list(zip(vectorizer.get_feature_names())),columns=['ftr'])

#  Alternatively, Select top 'k' of the vectorized features.
#     selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
#     selector.fit(x_train, train_labels)
#     scores = pd.DataFrame(list(zip(vectorizer.get_feature_names(), selector.scores_, selector.pvalues_)), 
#                                        columns=['ftr', 'score', 'pval'])
#     x_train = selector.transform(x_train).astype('float32')
#     x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val, scores


In [13]:
x_train, x_val, scores = ngram_vectorize(X_train['full_text_limited_preprocess'], Y_train['rating_overall'], X_test['full_text_limited_preprocess'])
x_train = x_train.toarray()
x_val = x_val.toarray()
y_train = Y_train.iloc[:, 0].values
y_test = Y_test.iloc[:, 0].values



In [14]:
len(x_train[0])

300

In [31]:
# Exporting features

# scores = pd.DataFrame(scores, columns = ['word'])
# scores.to_csv('rawTFIDF_feat2-4_top200.csv')

# scores.to_csv('selectKbestTFIDF_feat2-4_top200.csv')

# scores.sort_values(by=['score'], ascending = False)[:300].to_csv('top_300_features.csv')

## Random Forest

In [17]:
# RF first run
model = RandomForestClassifier(n_estimators=1000, class_weight = 'balanced')
model.fit(x_train, y_train)
print('Train score', model.score(x_train, y_train))
print('Test score', model.score(x_val, y_test))

y_pred = model.predict(x_val)
y_pred_proba = model.predict_proba(x_val)
y_pred_proba_list = []
for item in y_pred_proba:
    y_pred_proba_list.append(item[-1])
y_pred_proba = np.asarray(y_pred_proba_list)

Train score 1.0
Test score 0.8705035971223022


In [20]:
# evaluation
print(metrics.accuracy_score(y_test, np.where(y_pred_proba>0.5, 1,0)))

print(classification_report(y_test, np.where(y_pred_proba>0.5, 1,0)))

# Print confusion matrix using predictions
print(confusion_matrix(y_test, np.where(y_pred_proba>0.5, 1,0)))

0.8705035971223022
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       202
           1       0.87      0.62      0.72        76

    accuracy                           0.87       278
   macro avg       0.87      0.79      0.82       278
weighted avg       0.87      0.87      0.86       278

[[195   7]
 [ 29  47]]


In [40]:
# x_train[:3]

In [23]:
SEED = 5
base = RandomForestClassifier()
base.fit(x_train, y_train)


# Use feature selection to reduce number of columns to reduce unnecessary info.
sfm = SelectFromModel(base, threshold=0.0005, prefit=True)
X_t_train = sfm.transform(x_train)
X_t_test = sfm.transform(x_val)
# print(X.columns[sfm.get_support()]) # Print features selected



In [24]:
from sklearn.model_selection import cross_val_score

rf_model_1 = RandomForestClassifier(class_weight='balanced_subsample', random_state = 1)
print(rf_model_1.get_params())

{'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


## Tuning model

In [25]:
#Define a grid of hyperparameter 'params_'

param_grid_model_1 = {'n_estimators': [100, 200, 500, 1000], 
                      'max_features': ['auto', 'log2', "sqrt"],
                      "bootstrap"    : [True, False],  
                      'max_depth': [3, 10, 15, 20], 
                      'criterion': ['gini', 'entropy'],
                      'min_samples_leaf':[2, 5, 10] 
}


# Instantiate 'grid_'
grid_model_1 = GridSearchCV(estimator=rf_model_1,
                               param_grid=param_grid_model_1, 
                               cv=5,
                               scoring='f1',
                               verbose=1,
                               n_jobs=-1 
                              )

# Fit 'grid_' to the training set
grid_model_1.fit(X_t_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 17.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True,
                                              class_weight='balanced_subsample',
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=N...
                                              oob_score=False, random_state=1,
                                              verbose=0, warm_start=False),
    

In [26]:
# Extract best hyperparameters from 'grid_rf'
best_hyperparams_model_1 = grid_model_1.best_params_

print('Best hyperparameters:\n', best_hyperparams_model_1)

# Extract best model from 'grid_svc'
best_model_1 = grid_model_1.best_estimator_

# cross-validate the model on train data
cv_results = cross_val_score(best_model_1, X_t_train, y_train, cv=10) # returns of array of cross-validation scores

y_pred_train = best_model_1.predict(X_t_train)
y_pred_train_probs = best_model_1.predict_proba(X_t_train)[:, 1]


#print('Best hyperparameters:\n',cv_results) # the array len == cv (the array represents R2)
print("Training Accuracy (cross validation): {:.4f} ".format(cv_results.mean()))
print(cv_results.std())

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print (classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))


#print(classification_report(y_true, y_pred, target_names=target_names))

Best hyperparameters:
 {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 10, 'n_estimators': 200}
Training Accuracy (cross validation): 0.9183 
0.05853977681273533
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       481
           1       0.91      1.00      0.95       168

    accuracy                           0.98       649
   macro avg       0.96      0.98      0.97       649
weighted avg       0.98      0.98      0.98       649

[[465  16]
 [  0 168]]


In [27]:
y_pred_proba = best_model_1.predict_proba(X_t_test)
y_pred_proba_list = []
for item in y_pred_proba:
    y_pred_proba_list.append(item[-1])
y_pred_proba = np.asarray(y_pred_proba_list)

In [28]:
# evaluation

print(metrics.accuracy_score(y_test, np.where(y_pred_proba>0.5, 1,0)))

print(classification_report(y_test, np.where(y_pred_proba>0.5, 1,0)))

# Print confusion matrix using predictions
print(confusion_matrix(y_test, np.where(y_pred_proba>0.5, 1,0)))

0.89568345323741
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       202
           1       0.81      0.82      0.81        76

    accuracy                           0.90       278
   macro avg       0.87      0.87      0.87       278
weighted avg       0.90      0.90      0.90       278

[[187  15]
 [ 14  62]]


## Save Predicitons

In [29]:
df_predictions = pd.read_csv('actual_and_predictions_tbl.csv', index_col = 0)
df_predictions[:3]

Unnamed: 0,index,organisationType,type,name,region,postalCode,onspdLatitude,onspdLongitude,rating_overall,reportDate,...,RF_300_24,CNN_ngram_20000,CNN_unigram_20000,CNN_unigram_20000_300,CNN_ngram_40000_200,LR_300_24,CNN_ngram_20000_300,CNN_ngram_40000_300,CNN_ngram_40000_200_FT,CNN_ngram_20000_300_FT
0,2,Location,NHS Healthcare Organisation,Amersham Hospital,South East,HP7 0JD,51.663005,-0.621408,Requires improvement,6/20/2014,...,1,1,1,1,1,1,1,1,1,0
1,4,Location,NHS Healthcare Organisation,"Community Healthcare Services, St Mary's Hospital",South East,PO30 5TG,50.710843,-1.30133,Requires improvement,9/9/2014,...,0,1,1,0,1,0,0,1,0,0
2,6,Location,NHS Healthcare Organisation,Cossham Hospital,South West,BS15 1LF,51.468887,-2.51612,Good,2/11/2015,...,0,0,0,0,0,0,0,0,0,0


In [164]:
# df_predictions = df_predictions.drop('CNN_vanilla_20000', axis = 1)

# df_predictions['y_actual_rf'] = y_test

# df_predictions[:2]

# df_predictions['RF_300_24'] = np.where(y_pred_proba>0.5, 1,0)

# df_predictions.to_csv('actual_and_predictions_tbl.csv')

## Save Welsh Predictions

In [30]:
df_Welsh = pd.read_csv('Welsh_documents_df_revised_tableau_v1.csv', index_col = 0).reset_index()

In [31]:
df_Welsh = df_Welsh.drop('index', axis = 1)

In [40]:
df_Welsh.columns.values

array(['level_0', 'Type of establishment', 'Services provided',
       'Address line 1', 'Address line 2', 'Postcode',
       'Last inspection date', 'report_url', 'filename_report',
       'doc_index_first_line', 'doc_details', 'doc_index_last_line',
       'full_text', 'word_count', 'sentence_count', 'distinct_word_count',
       'avg_word_len', 'full_text_preprocess', 'start',
       'full_text_limited', 'full_text_limited_preprocess',
       'full_text_limited_preprocess_list', 'tokens_300', 'tokens_500',
       'tokens_1000', 'tokens_2000', 'tokens_5000',
       'full_text_limited_nlpprocess',
       'full_text_limited_nlpprocess_list', 'tokens_500_nlpprocess',
       'tokens_1000_nlpprocess', 'tokens_2000_nlpprocess',
       'tokens_5000_nlpprocess', 'tokens_10000_nlpprocess',
       'tokens_full_text_limited_doc2vec', 'full_text_nlpprocess',
       'word_count_nlpprocess', 'count_not', 'prop_not', 'prop_not_2',
       'bigram_full_text_preprocess', 'RF_300_24', 'human_check',
  

In [33]:
# ngram + tfidf

x_train, x_Welsh, scores = ngram_vectorize(X_train['full_text_limited_preprocess'], 
                                    Y_train['rating_overall'], df_Welsh['full_text_limited_preprocess'])
x_Welsh = x_Welsh.toarray()



In [34]:
# check features len
len(x_Welsh[0])

300

In [35]:
# transform features 
X_t_Welsh = sfm.transform(x_Welsh)

In [36]:
# predictions with the rf best model
y_pred_proba_Welsh = best_model_1.predict_proba(X_t_Welsh)
y_pred_proba_list = []
for item in y_pred_proba_Welsh:
    y_pred_proba_list.append(item[-1])
y_pred_proba_Welsh = np.asarray(y_pred_proba_list)

In [38]:
np.where(y_pred_proba_Welsh>0.5,1,0)

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [204]:
# Welsh_predictions = pd.DataFrame(np.where(y_pred_proba_Welsh>0.5,1,0), columns =['RF_300_24'])

## FastText word-embeddings
Instead of n-grams we can use fasttext word-embeddings either pre-trained by fb or pre-trained from our dataset

In [41]:
from gensim.models.fasttext import FastText

In [46]:
# loading previously pre-trained vectors on CQC data
fb_model_mine = FastText.load('fasttext_full_text_limited_nlpprocess_list.model')

In [47]:
# function to create word-embeddings, average word-embedding 
# the model name needs to be either the loaded fb vectors or fb trained on trained dataset vectors

def get_feature_vector_ft(data_column):
    index2word_set = set(fb_model_mine.wv.vocab.keys())  # words known to model
    featureVec = np.zeros(fb_model_mine.vector_size, dtype="float32")
    featureVec 
    # Initialize a counter for number of words in a review
    nwords = 0
    # Loop over each word in the comment and, if it is in the model's vocabulary, add its feature vector to the total
    for word in data_column:
        if word in index2word_set: 
            featureVec = np.add(featureVec, fb_model_mine[word])
            nwords += 1.

    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    #print(featureVec)
    return featureVec
#train_data["tweets_sent"]

In [48]:
# transforming train and test datasets

X_train["feature_vec_ft_soft_clean"] =X_train["full_text_limited_preprocess"].apply(get_feature_vector_ft)
X_train_ft_soft_clean = np.array(list(map(np.array, X_train.feature_vec_ft_soft_clean)))

X_test["feature_vec_ft_soft_clean"] =X_test["full_text_limited_preprocess"].apply(get_feature_vector_ft)
X_test_ft_soft_clean = np.array(list(map(np.array, X_test.feature_vec_ft_soft_clean)))

print(X_train["feature_vec_ft_soft_clean"].shape)
print(X_test["feature_vec_ft_soft_clean"].shape)

y_train = Y_train.iloc[:, 0].values
y_test = Y_test.iloc[:, 0].values

x_train = np.array(X_train['feature_vec_ft_soft_clean'].tolist())
x_val = np.array(X_test['feature_vec_ft_soft_clean'].tolist())

  if sys.path[0] == '':


(649,)
(278,)


In [49]:
from sklearn.feature_selection import SelectFromModel
SEED = 5
base = RandomForestClassifier()
base.fit(x_train, y_train)


# Use feature selection to reduce number of columns to reduce unnecessary info.
sfm = SelectFromModel(base, threshold=0.0005, prefit=True)
X_t_train = sfm.transform(x_train)
X_t_test = sfm.transform(x_val)
# print(X.columns[sfm.get_support()]) # Print features selected

from sklearn.model_selection import cross_val_score

rf_model_1 = RandomForestClassifier(class_weight='balanced_subsample', random_state = 1)
print(rf_model_1.get_params())

from sklearn.model_selection import GridSearchCV
#Define a grid of hyperparameter 'params_'
param_grid_model_1 = {'n_estimators': [100, 200, 500, 1000], 
                      'max_features': ['auto', 'log2', "sqrt"],
                      "bootstrap"    : [True, False],  
                      'max_depth': [3, 10, 15, 20], 
                      'criterion': ['gini', 'entropy'],
                      'min_samples_leaf':[2, 5, 10] 
}


# Instantiate 'grid_'
grid_model_1 = GridSearchCV(estimator=rf_model_1,
                               param_grid=param_grid_model_1, 
                               cv=5,
                               scoring='f1',
                               verbose=1,
                               n_jobs=-1 
                              )

# Fit 'grid_' to the training set
grid_model_1.fit(X_t_train, y_train)

# Extract best hyperparameters from 'grid_rf'
best_hyperparams_model_1 = grid_model_1.best_params_

print('Best hyperparameters:\n', best_hyperparams_model_1)

# Extract best model from 'grid_svc'
best_model_1 = grid_model_1.best_estimator_

# cross-validate the model on train data
cv_results = cross_val_score(best_model_1, X_t_train, y_train, cv=10) # returns of array of cross-validation scores

y_pred_train = best_model_1.predict(X_t_train)
y_pred_train_probs = best_model_1.predict_proba(X_t_train)[:, 1]


#print('Best hyperparameters:\n',cv_results) # the array len == cv (the array represents R2)
print("Training Accuracy (cross validation): {:.4f} ".format(cv_results.mean()))
print("Training sd (cross validation:  {:.4f} ".format(cv_results.std()))

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print (classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


{'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 22.7min finished


Best hyperparameters:
 {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 200}
Training Accuracy (cross validation): 0.9167 
Training sd (cross validation:  0.0578 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       481
           1       0.92      1.00      0.96       168

    accuracy                           0.98       649
   macro avg       0.96      0.99      0.97       649
weighted avg       0.98      0.98      0.98       649

[[467  14]
 [  0 168]]


In [51]:
y_pred_proba = best_model_1.predict_proba(X_t_test)
y_pred_proba_list = []
for item in y_pred_proba:
    y_pred_proba_list.append(item[-1])
y_pred_proba = np.asarray(y_pred_proba_list)

print("Test Accuracy: {:.4f} ".format(metrics.accuracy_score(y_test, np.where(y_pred_proba>0.5, 1,0))))

print(classification_report(y_test, np.where(y_pred_proba>0.5, 1,0)))

# Print confusion matrix using predictions
print(confusion_matrix(y_test, np.where(y_pred_proba>0.5, 1,0)))

Test Accuracy: 0.8885 
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       202
           1       0.79      0.80      0.80        76

    accuracy                           0.89       278
   macro avg       0.86      0.86      0.86       278
weighted avg       0.89      0.89      0.89       278

[[186  16]
 [ 15  61]]


In [None]:
# files: 
# 1. fb file
# x_train, x_test, y_train, y_test