## Import Libraries and Modules

In [17]:
!pip install snorkel
!pip install textblob
import io
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


In [None]:
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Loading Dataset into the NB

In [3]:
uploaded = '/Users/naisheel/Desktop/Python/Capstone Project/Google Storage Archive/train.csv'
df = pd.read_csv(uploaded)

df = df.rename(columns = {'Description': 'text'})
df['text'] = df['text'].astype(str)

#about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   text         120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


## Dataset Preprocessing

In [4]:
Pos = 1
Neg = 0
Neutral = -1

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return Neutral

def make_keyword_lf(keywords, label=Pos):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))
keyword_positive = make_keyword_lf(keywords=['boosts', 'great', 'develops', 'promising', 'amazing', 'speelbounding', 'ambitious', 'delighted', 'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 'party', 'hope', 'flourish', 'respect', 'partnership', 'champion', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured' ])
keyword_negative = make_keyword_lf(keywords=['war','solidiers', 'conflict', 'turmoil', 'injur','trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=Neg) # manually adding some pos/neg keywords to boost labelling

In [5]:
# fetching polarising sentiments
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

In [6]:
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return Pos if x.polarity > 0.6 else Neutral

In [7]:
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return Pos if x.subjectivity >= 0.5 else Neutral

In [8]:
# df = df.iloc[:50000] # Subset of train dataset for faster results due to limitation in hardware resources (Final result, however would be a model trained on the whole dataset)

In [54]:
#combine all the labeling functions
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]
#implement the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
#implement the label model
label_model = LabelModel(cardinality=2, verbose=True)
#fit on the data
label_model.fit(L_snorkel)
#predicting and labelling
df["label"] = label_model.predict(L=L_snorkel)

100%|██████████| 27665/27665 [00:19<00:00, 1442.95it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.479]
INFO:root:[10 epochs]: TRAIN:[loss=0.168]
INFO:root:[20 epochs]: TRAIN:[loss=0.025]
INFO:root:[30 epochs]: TRAIN:[loss=0.039]
INFO:root:[40 epochs]: TRAIN:[loss=0.018]
INFO:root:[50 epochs]: TRAIN:[loss=0.020]
INFO:root:[60 epochs]: TRAIN:[loss=0.017]
INFO:root:[70 epochs]: TRAIN:[loss=0.017]
INFO:root:[80 epochs]: TRAIN:[loss=0.016]
INFO:root:[90 epochs]: TRAIN:[loss=0.016]
100%|██████████| 100/100 [00:00<00:00, 1086.00epoch/s]
INFO:root:Finished Training


In [55]:
#Filtering out unlabeled data points
df= df.loc[df.label.isin([0,1]), :]
#find the label counts
df['label'].value_counts()

label
1    16461
0    11204
Name: count, dtype: int64

In [56]:
#make a copy of the dataframe
data = df.copy()
#define a function which handles the text preprocessing
def text_data_prep(data):
    """
    This pipeline processes our data by executing the following steps:
    Tokenization, Lemmatization, Removing stopwords, & removing punctuation
    """
    # initialize spacy object
    nlp = spacy.load('en_core_web_sm')
    # select raw text
    raw_text = data.text.values.tolist()
    # tokenize
    tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]
    #define the punctuations and stop words
    punc = string.punctuation
    stop_words = set(stopwords.words('english'))
    #lemmatize, remove stopwords and punctuationd
    corpus = []
    for doc in tqdm(tokenized_text):
        corpus.append([word.lemma_ for word in doc[0] if (word.lemma_ not in stop_words and word.lemma_ not in punc)])
    # add prepared data to df
    data["text"] = corpus
    return data
#apply the data preprocessing function
data =  text_data_prep(data)

100%|██████████| 27665/27665 [06:09<00:00, 74.93it/s]
100%|██████████| 27665/27665 [00:01<00:00, 22623.26it/s]


In [57]:
def text_representation(data):
  tfidf_vect = TfidfVectorizer()
  data['text'] = data['text'].apply(lambda text: " ".join(set(text)))
  X_tfidf = tfidf_vect.fit_transform(data['text'])
  print(X_tfidf.shape)
  print(tfidf_vect.get_feature_names_out())
  X_tfidf = pd.DataFrame(X_tfidf.toarray())
  return X_tfidf
#apply the TFIDV function
X_tfidf = text_representation(data)

(27665, 26854)
['00' '000' '000metres' ... 'zwick' 'zyman' 'zz']


## Training (Supervised Learning)

In [58]:
X= X_tfidf
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#fit Log Regression Model
clf= LogisticRegression()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.80      0.87      3676
           1       0.88      0.97      0.92      5454

    accuracy                           0.90      9130
   macro avg       0.91      0.88      0.89      9130
weighted avg       0.90      0.90      0.90      9130



### Testing our model on a new text

## Saving the model

In [67]:
model.save('model_path.h5') # saves model as new executable file to analyse new data

  saving_api.save_model(


## Downloading Preprocessed Dataset

In [18]:
df.to_csv('pre-processed-dataset', sep=',', index=False, encoding='utf-8')