In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pandas as pd
#from implementations import *
#from split_data import * 
%load_ext autoreload
%autoreload 2


In [2]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Main external library : Natural Language Toolkit (nltk)
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load tweets

In [4]:
DATA_FOLDER = "data/"

POSITIVE_DATASET = DATA_FOLDER+"train_pos.txt"
NEGATIVE_DATASET = DATA_FOLDER+"train_neg.txt"

import os
current_directory = os.getcwd()
print(current_directory) 

# Note: it seems that the data is already to lower case, so no need to apply lower() to the text
pos_data = pd.read_fwf('data/train_pos.txt', header=None, names=["text"]).drop_duplicates()
pos_data["labels"] = 1
neg_data = pd.read_fwf('data/train_neg.txt', header=None, names=["text"]).drop_duplicates().apply(lambda x: x.str.lower())
neg_data["labels"] = 0

/Users/teframartin/Informatik/ML/project2


### Cleaning Data

In [5]:
from collections import Counter
import re
import string
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
pd.set_option('display.max_colwidth',100)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# A word that is so common that there is no need to use it in a search
ENGLISH_STOP_WORDS = nltk.corpus.stopwords.words('english')

# Adding few extra stop word
ENGLISH_STOP_WORDS = ENGLISH_STOP_WORDS + ['im', 'dont','dunno', 'cant', ' 2 ', "'s", ' u ', ' x ', 'ive', 'user']

In [7]:
#Concatenate the two training sets of positive and negative tweets
train_data = pd.concat([pos_data, neg_data], ignore_index=True)

In [8]:
# Calculate the most common words used in the set of all tweets
def get_most_common_words(txt,limit):
    return Counter(txt.split()).most_common()[:limit]

# Remove from tweets the punctuation and stop words (= a word that is so common that there is no need to use it in a search.)
def clean_tweet(tweet):
    tweet = "".join([w for w in tweet if w not in string.punctuation])
    tokens = re.split('\W+', tweet)
    tweet = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tweet

# Change any word belonging to the same word-family into a common word (changing/changes/changed.. ==> change)
def lemmatization(token_tweet):
    tweet = [wn.lemmatize(word) for word in token_tweet]
    return tweet

# Concatenate the tokennized tweet into a all text like at the beginning
def concatenate(lst):
    concatenate_tweet = ''
    for elem in lst:
        concatenate_tweet = concatenate_tweet + ' ' + elem
    return concatenate_tweet

In [9]:
# Apply the clean_tweet transformation
train_data['text'] = train_data['text'].apply(lambda x : clean_tweet(x)).apply(lambda x : lemmatization(x))

In [10]:
train_data.sample(10)

Unnamed: 0,text,labels
160157,"[readymade, tree, medium, green, 751, 25, 8, assembly, required, readytoplant, tree, lend, authe...",0
20485,"[aw, thanks, late, though, final, stage, huge, tooth, makeover, last, appointment, scheduled, to...",1
176511,"[17x25, custom, picture, frame, poster, frame, 125, wide, complete, rich, brown, wood, frame, 52...",0
170994,"[peanut, butter, really, visit, leaf, thatd, ahhhmazing, ]",0
30517,"[dress, pretty]",1
73635,"[think, emily, make, sexy, guy, ]",1
12976,"[haha, ill, picture, itll, help, calm, nerve, thanks, pizz, ]",1
162225,"[, amazonbasics, usb, 20, amale, afemale, extension, cable, 98, foot, 30, meter, url, wby]",0
165317,[],0
134867,"[, hed, get, enjoy, outdoors, lol, unfortunate, coincidence, would, gladly, put, ]",0


### Creation of the model

1. Initialize a task-specific model

In [11]:
model = ClassificationModel("roberta", "roberta-base", use_cuda=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

2. Train the model with train_model()

In [13]:
model.train_model(train_data.sample(frac=0.005), output_dir="outputs/roberta", args={"overwrite_output_dir": True, "num_train_epochs": 1})

  0%|          | 0/907 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 2/907 [00:02<21:10,  1.40s/it]
Epochs 0/1. Running Loss:    0.6771:  39%|███▉      | 45/114 [00:53<01:21,  1.19s/it]
Epoch 1 of 1:   0%|          | 0/1 [00:53<?, ?it/s]


KeyboardInterrupt: 

3. Evaluate the model

In [None]:
# sample an other part of the train full dataset and measure the accuracy
# with eval_model()
result, model_outputs, wrong_predictions = model.eval_model(train_data.sample(frac=0.005))

In [None]:
result, model_outputs

4. Make predictions on (unlabelled) data

In [None]:
# create the submission file on the test dataset
#  with predict()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")