#### Downloading required libraries

In [1]:
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install -U pandas
!pip install -U scikit-learn

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 1.3 MB/s  eta 0:00:01


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


#### Loading required libraries

In [2]:
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
def train_logistic_classifier(X_train, y_train, C, regularisation):
    """
      Fit a logistic-regression classifier on the given training data.

      X_train, y_train — training data, passed unchanged to LogisticRegression.fit
      C — inverse regularisation strength (forwarded as ``C``)
      regularisation — penalty type, e.g. 'l2' or 'l1' (forwarded as ``penalty``)

      return: trained classifier
    """

    # Forward the hyper-parameters; previously they were accepted but ignored,
    # so every call trained with scikit-learn's defaults.
    params = {'C': C, 'penalty': regularisation}
    if regularisation == 'l1':
        # The default 'lbfgs' solver rejects an L1 penalty; 'liblinear' supports it.
        params['solver'] = 'liblinear'

    model = LogisticRegression(**params).fit(X_train, y_train)
    return model


def print_evaluation_scores(y_test, predicted):
    """
      Print accuracy and the macro / micro / weighted F1 scores.

      y_test — true labels
      predicted — labels predicted by the model
    """
    print('Accuracy: ', accuracy_score(y_test, predicted))

    # One line per F1 averaging mode, in the same order as before.
    f1_variants = (
        ('F1-score macro: ', 'macro'),
        ('F1-score micro: ', 'micro'),
        ('F1-score weighted: ', 'weighted'),
    )
    for caption, averaging in f1_variants:
        print(caption, f1_score(y_test, predicted, average=averaging))
    
def prepare_length_feature(pa_text, not_pa_text, random_state=None):
    """
      Build a shuffled single-feature dataset from paragraph lengths.

      pa_text — strings of the positive class (labelled 1)
      not_pa_text — strings of the negative class (labelled 0)
      random_state — optional seed for the shuffle; None (the default) gives a
          different row order on every call, an int makes the order reproducible

      return: (features, labels) — features is an (n, 1) array of len(text)
          values, labels is the matching pandas Series of 0/1
    """
    pa_lengths = [len(text) for text in pa_text]
    not_pa_lengths = [len(text) for text in not_pa_text]

    df = pd.DataFrame({
        'feature': pa_lengths + not_pa_lengths,
        'label': [1] * len(pa_lengths) + [0] * len(not_pa_lengths),
    })

    # Rows are built class-by-class, so shuffle to interleave the two classes.
    # frac=1 returns every row exactly once, in random order.
    df = df.sample(frac=1, random_state=random_state)

    # reshape(-1, 1) gives the 2-D (n_samples, 1) layout that estimators expect.
    features = df.feature.values.reshape(-1, 1)
    labels = df.label
    return features, labels

## Exploratory Analysis

#### Reading the dataset

In [4]:
# Read the train / test / validation splits with identical parser options.
Corpus, test_csv, valid_csv = (
    pd.read_csv(csv_path, delimiter=",", encoding='utf-8', index_col=False)
    for csv_path in ("train.csv", "test.csv", "valid.csv")
)

#### Check the number of rows and columns

In [5]:
# (number of rows, number of columns) of the training frame
Corpus.shape

(250000, 3)

#### Display top 5 rows

In [6]:
# Preview the first five rows of the training frame.
Corpus.head(n=5)

Unnamed: 0,ParagraphType,DocumentId,Text
0,TI,13049012-Glider--aircraft,Glider
1,TI,13049012-Glider--aircraft,Aircraft
2,AB,13049012-Glider--aircraft,A glider is a heavier-than-air aircraft that i...
3,AB,13049012-Glider--aircraft,There is a wide variety of types differing in ...
4,H1,13049012-Glider--aircraft,Etymology


## 1.a How many different paragraph types are there?

In [7]:
# Number of distinct values in the ParagraphType column.
len(Corpus['ParagraphType'].unique())

10

## 1.b What is the frequency of each type?

In [8]:
# Row count per ParagraphType value (most frequent first)
Corpus['ParagraphType'].value_counts()

PA    126792
LI     43704
H1     42250
H2     17848
TI      8711
BY      4210
AB      3188
H3      2580
HA       640
CO        77
Name: ParagraphType, dtype: int64

## 1.c What is the most common word in ‘H1’ paragraph type?

In [9]:

import statistics
from statistics import mode

# Rows labelled "H1" and their text as a plain Python list.
h1_col = Corpus.loc[Corpus["ParagraphType"].eq("H1")]
all_h1_text = h1_col["Text"].to_list()

# The "PA" rows are collected here too; their text is used by a later cell.
pa_col = Corpus.loc[Corpus["ParagraphType"].eq("PA")]
all_pa_text = pa_col["Text"].to_list()

# Flatten all H1 texts into whitespace-separated tokens, then take the most frequent token.
words_counts = [token for heading in all_h1_text for token in heading.split()]
statistics.mode(words_counts)


'References'

## 1.d What is the median length of ‘H1’ and ‘PA’ paragraph types?

In [10]:
# Length is measured in characters (len of each text string).
all_h1_lengths = list(map(len, all_h1_text))
statistics.median(all_h1_lengths)

10.0

In [11]:
# Length is measured in characters (len of each text string).
all_pa_lengths = list(map(len, all_pa_text))
statistics.median(all_pa_lengths)

329.0

## PA, non PA classifier

## 2.a What is the frequency of PA and non-PA types?

In [12]:
# Partition the training frame on the ParagraphType label: "PA" vs. everything else.
pa_col = Corpus.loc[Corpus["ParagraphType"].eq("PA")]
non_pa_col = Corpus.loc[Corpus["ParagraphType"].ne("PA")]

# Raw paragraph strings on each side of the split.
pa_text = pa_col["Text"].to_list()
not_pa_text = non_pa_col["Text"].to_list()

# Class frequencies.
print("PA = ", pa_col.shape[0])
print("NON PA = ", non_pa_col.shape[0])

PA =  126792
NON PA =  123208


## 2.b build a simple binary classification model based on length

In [13]:

# Turn the PA / non-PA training texts into a single length feature with 1/0 labels.
train_only_length_feature, train_only_length_label = prepare_length_feature(pa_text, not_pa_text)

# Baseline classifier that sees only the paragraph length.
# NOTE: the name `logisticRegr` is rebound to the TF-IDF model in a later cell,
# so the length-only evaluation must run before that cell.
logisticRegr = LogisticRegression()
logisticRegr.fit(train_only_length_feature, train_only_length_label)


LogisticRegression()

## 2.c Which evaluation metrics will you use to evaluate the prediction quality?

I will use F1 scores to report the prediction quality, because both precision and recall are important in this case. 
F1, the harmonic mean of precision and recall, is a good single measure of overall quality; accuracy is reported alongside it.

## 2.d Report values of these metrics on your validation dataset

In [14]:
# Partition the validation frame on the ParagraphType label: "PA" vs. everything else.
valid_pa_col = valid_csv.loc[valid_csv["ParagraphType"].eq("PA")]
valid_non_pa_col = valid_csv.loc[valid_csv["ParagraphType"].ne("PA")]
valid_pa_text = valid_pa_col["Text"].to_list()
valid_not_pa_text = valid_non_pa_col["Text"].to_list()

# Length feature and 1/0 labels for the validation texts.
valid_only_length_feature, valid_only_length_label = prepare_length_feature(
    valid_pa_text, valid_not_pa_text
)

# Score the fitted length-only model on the validation split.
predicted = logisticRegr.predict(valid_only_length_feature)
print_evaluation_scores(valid_only_length_label, predicted)



Accuracy:  0.8671348258407858
F1-score macro:  0.866739611515037
F1-score micro:  0.8671348258407858
F1-score weighted:  0.8666134391237528


## 2.e Build a model that uses information from the paragraph text only

### First tried tf-idf based approach

In [15]:
from sklearn.model_selection import train_test_split
# Inputs are the raw paragraph texts; targets are 1 for "PA" rows and 0 otherwise.
train_X = Corpus['Text']
train_y = (Corpus['ParagraphType'] == "PA").astype(int)

valid_X = valid_csv['Text']
valid_y = (valid_csv['ParagraphType'] == "PA").astype(int)

test_X = test_csv['Text']
test_y = (test_csv['ParagraphType'] == "PA").astype(int)

#### Calculate TF-IDF
#### Term Frequency: how often a given word appears within a document
#### Inverse Document Frequency: down-weights words that appear in many documents

In [16]:
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
# spaCy's English stop words, extended with a few extra tokens to ignore.
stop_words = list(STOP_WORDS) + ['aa','aah','da','lar','ok']

# max_df=0.8 ignores terms that occur in more than 80% of the documents.
Tfidf_vect = TfidfVectorizer(stop_words=stop_words,max_df=0.8)

# Learn the vocabulary and IDF weights on the training text only, and vectorise
# it in the same pass (equivalent to fit() then transform(), but the training
# corpus is tokenised once instead of twice).
train_X_Tfidf = Tfidf_vect.fit_transform(train_X)

# Validation and test texts are projected onto the training vocabulary.
valid_X_Tfidf = Tfidf_vect.transform(valid_X)
test_X_Tfidf = Tfidf_vect.transform(test_X)

In [17]:
# Converting to a dense matrix and putting it in a DataFrame to view the Tfidf matrix.
# NOTE: todense() materialises every (document, term) cell, zeros included, so
# memory use grows with n_documents * vocabulary_size.
Dense_mat = train_X_Tfidf.todense()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(Tfidf_vect, "get_feature_names_out"):
    feature_names = Tfidf_vect.get_feature_names_out()
else:
    feature_names = Tfidf_vect.get_feature_names()

Tfidf_Mat = pd.DataFrame(Dense_mat, columns=feature_names)

#### Use the ML Algorithms to Predict the label

#### Implementing Logistic Regression

In [18]:
# Train a logistic-regression classifier on the TF-IDF features.
# NOTE: this rebinds `logisticRegr`, which previously held the length-only model.
logisticRegr = LogisticRegression()
logisticRegr.fit(train_X_Tfidf,train_y)

LogisticRegression()

## 2.f Report the prediction quality of your model on validation dataset

In [19]:
# Score the TF-IDF model on the validation split.
valid_pred = logisticRegr.predict(valid_X_Tfidf)
print_evaluation_scores(valid_y, valid_pred)

# Test-split evaluation, currently disabled:
# test_pred = logisticRegr.predict(test_X_Tfidf)
# print_evaluation_scores(test_y, test_pred)

Accuracy:  0.8846007642598483
F1-score macro:  0.8845890763124387
F1-score micro:  0.8846007642598483
F1-score weighted:  0.8845688838148266
Accuracy:  0.4954864593781344
F1-score macro:  0.3313212608987257
F1-score micro:  0.4954864593781344
F1-score weighted:  0.6626425217974514


### Also tried Bag of Words

In [21]:

# from collections import Counter
# from scipy import sparse as sp_sparse
# words_counts = Counter(words_counts)
# DICT_SIZE = 5000
# POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
# WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
# INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
# ALL_WORDS = WORDS_TO_INDEX.keys()

# def my_bag_of_words(text, words_to_index, dict_size):
#     """
#         text: a string
#         dict_size: size of the dictionary
        
#         return a vector which is a bag-of-words representation of 'text'
#     """
#     result_vector = np.zeros(dict_size)
#     for word in text.split(' '):
#         if word in words_to_index:
#             result_vector[words_to_index[word]] +=1
#     return result_vector

# X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in train_X])
# X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in valid_X])
# X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in test_X])


# print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

# classifier_mybag = train_logistic_classifier(X_train_mybag, train_y, C = 4, regularisation = 'l2')
# y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
# print_evaluation_scores(valid_y, y_val_predicted_labels_mybag)

# y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)
# print_evaluation_scores(test_y, y_test_predicted_labels_mybag)




X_train shape  (250000, 5000) 
X_val shape  (49983, 5000)
Accuracy:  0.8865414240841887
F1-score macro:  0.8863509954481943
F1-score micro:  0.8865414240841887
F1-score weighted:  0.8862701144683903
Accuracy:  0.45456369107321964
F1-score macro:  0.31250861950075853
F1-score micro:  0.45456369107321964
F1-score weighted:  0.6250172390015171
