# 1. Setup

#### Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from functools import partial
import io
import os
from IPython.display import clear_output

# Import from fastai
import fastai
from fastai import *
from fastai.text import * 

# Import from nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams, FreqDist

# Import from sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Ignore warnings
# NOTE: no category or module filter is given, so this silences every warning
# raised in the kernel (deprecation notices included), not just noisy ones.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Get device name
# Guarded with is_available() so the cell reports a fallback string instead of
# raising when the notebook is opened on a machine without a CUDA device.
device_name = (torch.cuda.get_device_name(0)
               if torch.cuda.is_available()
               else 'CPU (no CUDA device available)')
device_name

'Quadro P6000'

# 2. Load the data

In [3]:
# Read the labelled reviews from disk
labeled_path = '../0_datasets/labeled_data.csv'
labeled_data = pd.read_csv(labeled_path)

# Preview the first five rows
labeled_data.head(5)

Unnamed: 0,label,text
0,4,The new rule is - \r\nif you are waiting for a...
1,3,"Flirted with giving this two stars, but that's..."
2,5,I was staying at planet Hollywood across the s...
3,2,Food is good but prices are super expensive. ...
4,1,Worse company to deal with they do horrible wo...


In [4]:
# Import the test set and the unlabeled reviews
test_data = pd.read_csv('../0_datasets/test_data.csv')
unlabeled_data = pd.read_csv('../0_datasets/unlabeled_data.csv')

# Combine all the data (for training the language model).
# A single pd.concat replaces the chained DataFrame.append calls: append is
# deprecated and was removed in pandas 2.0, and every call re-copied the frame
# accumulated so far. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each source frame's own row labels.
all_data = pd.concat(
    [labeled_data[['text']], test_data, unlabeled_data],
    ignore_index=True,
)

# Check the length of the dataframe
print(len(all_data))
all_data.head(5)

700000


Unnamed: 0,text
0,The new rule is - \r\nif you are waiting for a...
1,"Flirted with giving this two stars, but that's..."
2,I was staying at planet Hollywood across the s...
3,Food is good but prices are super expensive. ...
4,Worse company to deal with they do horrible wo...


# 3. Process the data

In [5]:
# Split to train and test sets
# Holds out 20% of the labelled reviews; random_state pins the shuffle so the
# split is repeatable. `test_df` is what the classifier databunch later
# receives as its validation set.
train_df, test_df = train_test_split(labeled_data, test_size=0.2, random_state=100)

In [6]:
# Report (rows, columns) of the training split, then of the held-out split
print(train_df.shape, test_df.shape, sep='\n')

(40000, 2)
(10000, 2)


In [7]:
# Language model data, built step by step:
# 1) wrap the 'text' column of the combined frame as a fastai TextList
lm_items = TextList.from_df(df=all_data, cols='text')
# 2) hold out a random 10% for validation
lm_split = lm_items.random_split_by_pct(0.1)
# 3) label for language modelling
lm_labeled = lm_split.label_for_lm()
# 4) assemble the databunch with batch size 192
data_lm = lm_labeled.databunch(bs=192)

In [8]:
# Show a sample batch from the language-model data.
# Rows are fragments of the tokenized text rather than whole reviews: as the
# output below shows, a row can start mid-sentence and contain an `xxbos`
# marker partway through.
data_lm.show_batch()

idx,text
0,""" . \r \n \r \n p.s you can try calling in ahead to reserve a table but that s xxup only if the waiting list is short . xxmaj otherwise , you have to show up to reserve . xxmaj xxunk . \r \n \r \n \r \n xxmaj this place could do no wrong in my eyes . xxmaj rattle away you equally - clever /"
1,! xxbos i do n't see the need normally to review a grocery store but this one needs dire help . \r \n i 've lived in xxmaj lakewood for several years now and always tell myself i 'm never going back to xxmaj marc 's anytime i step in there . i normally like xxmaj marc 's but the xxmaj lakewood location is godawful . \r \n
2,would never warrant a return for a third shot . xxmaj but i dummy up and defer ; all the while repeating a please do n't suck mantra with my inside voice and hoping three 's a charm rather than three strikes you 're out . \r \n \r \n i 'm the only one drinking and i 'm pretty jazzed about sucking down some post - morning mojitos
3,"'s not . xxup sai has my business for now . xxmaj keep up the good work . xxmaj the chicken fried rice was delicious . xxmaj they have a very nice menu . a xxbos xxmaj slow drive thru , bad customer service cold food why even bother . xxmaj this might be the worst mcdonalds in the valley . xxmaj it is worth the drive to go to"
4,"in i just walk by and at the end of my workout i check in . xxmaj people want a red carpet laid out for them everywhere they go . \r \n \r \n i go to the gym just about every day , and at different times . xxmaj the evenings are a bit busy but as long as you are flexible with your exercises you will have"


In [9]:
# Classifier model data: the labelled train/validation frames, reusing the
# vocabulary of the language-model databunch
lm_vocab = data_lm.train_ds.vocab
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=train_df,
    valid_df=test_df,
    text_cols='text',
    label_cols='label',
    vocab=lm_vocab,
    bs=32,
)

In [10]:
# Show the first few reviews
# (tokenized text in one column, its target label in the other)
data_clas.show_batch()

text,target
"xxbos xxmaj we went on a vacation to xxup xxunk from 2 / 8 to 2 / 14 and our regular sitter had taken some time off , so we decided to use xxmaj furry xxmaj pals . \r \n xxmaj we were put in touch with xxmaj jill , to take care of our home and finches . \r \n xxmaj right off the bat we liked",2
"xxbos xxmaj in our continuing quest to identify cool , locally owned places to eat and / or drink , xxmaj caroline and i xxunk xxmaj vintage 95 last night . \r \n \r \n xxmaj before i go further , understand that whenever i go out for eats or drinks , i have in mind a xxmaj platonic xxmaj ideal of the xxmaj bar / xxmaj pub /",4
"xxbos i guess i 'm the first person here to give them a bad review . xxmaj let me set the record straight first , i 've only done personal training sessions at this gym and they were pretty good . xxmaj what i 'm writing today is to give everybody a warning to watch out for their billing system as i had the opportunity to take a deeper look",2
"xxbos xxmaj we will xxup never return . xxmaj this was a xxup terrible , xxup awful experience from beginning to end ... service , food , everything has taken me from a patron who took out - of - towners here , to someone who will xxup never return to this location . xxmaj biggest offense - serving beef in the veggie tacos and burros - more than once",1
xxbos i 've been going to the xxmaj postal xxmaj route for 3 years . i 've never had a major problem other than waiting . xxmaj there are long lines 50 % of the time . xxmaj the staff has been quite friendly . xxmaj recently i went in to mail a fedex return item . xxmaj the fedex instructions advised that you could print your own return label,1


# 4. Fine-Tuning the Pre-Trained Model

In [11]:
# Create the language model
# AWD_LSTM architecture on the combined-text databunch; drop_mult=0.3 is the
# dropout multiplier handed to fastai.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

In [12]:
# Train the language model
# One one-cycle epoch at max learning rate 1e-2, run before `unfreeze()` is
# called (presumably only the non-frozen layer group trains here - confirm
# against the fastai defaults). Took about 1h05 per the recorded output.
learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.824158,3.701355,0.296518,1:05:10


In [13]:
# Checkpoint the learner after the first language-model epoch
learn.save('first_epoch_feature_encoder')

In [14]:
# Reload the 'first_epoch_feature_encoder' checkpoint (presumably so the
# notebook can resume from here); the trailing `;` hides the cell's output.
learn.load('first_epoch_feature_encoder');

In [15]:
# Unfreeze the learner so the following epochs fine-tune the whole model
learn.unfreeze()

In [16]:
# Train the language model over 7 epochs
# Uses a lower max learning rate (1e-3) than the first epoch (1e-2); each epoch
# took roughly 1h13 per the recorded output.
learn.fit_one_cycle(7, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.47892,3.408178,0.332694,1:12:47
1,3.372303,3.303167,0.345615,1:12:47
2,3.305875,3.246336,0.352416,1:12:46
3,3.25847,3.207966,0.357096,1:12:48
4,3.196824,3.179644,0.360906,1:12:46
5,3.173029,3.162857,0.363069,1:12:49
6,3.146326,3.159479,0.363575,1:12:48


In [17]:
# Save the encoder
# The classifier section loads it back by the same name via
# `load_encoder('feature_encoder')`.
learn.save_encoder('feature_encoder')

# 5. Train the Classifier

In [61]:
# Create the classifier
# NOTE: this rebinds `learn`, replacing the language-model learner from above.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.3)
# Load the encoder saved from the fine-tuned language model
learn.load_encoder('feature_encoder')
# Freeze before the first pass (presumably leaves only the last layer group
# trainable - confirm against fastai's `freeze` documentation)
learn.freeze()

In [62]:
# Train the classifier
# One one-cycle epoch at max learning rate 2e-2 on the frozen learner.
learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.899335,0.803559,0.6504,01:33


In [63]:
# Checkpoint the classifier after the first (frozen) training pass
learn.save('first_layer')

In [64]:
# Reload the 'first_layer' checkpoint; the trailing `;` hides the cell's output
learn.load('first_layer');

In [65]:
# Train the classifier - next layer
# freeze_to(-2) widens what is trainable (presumably the last two layer groups -
# confirm against fastai docs).
learn.freeze_to(-2)
# slice(lo, hi) passes a learning-rate range; here lo = hi / 2.6**4 with hi = 1e-2
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.790438,0.76107,0.6724,01:49


In [66]:
# Checkpoint the classifier after the freeze_to(-2) pass
learn.save('second_layer')

In [67]:
# Reload the 'second_layer' checkpoint; the trailing `;` hides the cell's output
learn.load('second_layer');

In [68]:
# Train the classifier - next layer
# freeze_to(-3) widens what is trainable again (presumably the last three layer
# groups - confirm against fastai docs).
learn.freeze_to(-3)
# Learning-rate range with the upper bound halved to 5e-3; lo = hi / 2.6**4
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.764529,0.739918,0.6792,02:36


In [69]:
# Checkpoint the classifier after the freeze_to(-3) pass
learn.save('third_layer')

In [70]:
# Reload the 'third_layer' checkpoint; the trailing `;` hides the cell's output
learn.load('third_layer');

In [71]:
# Train the classifier - all layers
learn.unfreeze()
# NOTE(review): the lower bound divides by 1.6**4 here, whereas the two previous
# stages divide by 2.6**4. This may be a typo; confirm which factor was intended
# before re-running, since changing it alters the learning-rate range.
# In the recorded run valid_loss rose slightly (0.7399 -> 0.7415) while
# train_loss fell to 0.664.
learn.fit_one_cycle(1, slice(1e-3/(1.6**4),1e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.664066,0.741541,0.6818,03:22


In [72]:
# Checkpoint the fully fine-tuned classifier
learn.save('final_classifier')

In [73]:
# Reload the 'final_classifier' checkpoint used for the predictions below;
# the trailing `;` hides the cell's output
learn.load('final_classifier');

# 6. Make predictions on the test data

In [74]:
# Create function to get prediction
def get_prediction(review_as_string):
    """Predict the label of one review and report running progress.

    Relies on two module-level names: `learn` (the trained classifier) and
    `prediction_number` (a counter that must be initialised before the first
    call). Each call clears the cell output, increments the counter and
    prints it.

    Args:
        review_as_string: the review text to classify.

    Returns:
        The first element of `learn.predict(...)`, converted to `str`.
    """
    global prediction_number

    # Replace the previous progress line rather than appending a new one
    clear_output(wait=True)
    prediction_number = prediction_number + 1
    print(f'Predictions made: {prediction_number}')

    predicted_category = learn.predict(review_as_string)[0]
    return str(predicted_category)

In [75]:
# Output frame with the two submission columns; 'label' is filled in later
final_predictions = pd.DataFrame(columns=['test_id', 'label'])

# One id per test row, numbered from 1: test_1, test_2, ...
n_test_rows = len(test_data)
test_ids = [f'test_{i}' for i in range(1, n_test_rows + 1)]

# Attach the ids to the output frame
final_predictions['test_id'] = test_ids

# Row count, then a preview of the first 5 rows
print(len(final_predictions))
final_predictions.head(5)

50000


Unnamed: 0,test_id,label
0,test_1,
1,test_2,
2,test_3,
3,test_4,
4,test_5,


In [76]:
# Reset the progress counter read by get_prediction, then classify every test
# review and store the results in the 'label' column
prediction_number = 0
predicted_labels = test_data['text'].apply(get_prediction)
final_predictions['label'] = predicted_labels

Predictions made: 50000


In [77]:
# Preview the first 5 rows now that 'label' is populated
final_predictions.head(5)

Unnamed: 0,test_id,label
0,test_1,4
1,test_2,4
2,test_3,1
3,test_4,5
4,test_5,4


In [78]:
# Save the predictions
# Written next to the notebook; index=False keeps the row index out of the
# file, so it contains only the 'test_id' and 'label' columns.
final_predictions.to_csv('./predict_label.csv', index=False)