In [1]:
#Upgrade dependencies
!pip install --upgrade pip
!pip install --upgrade scikit-learn

Requirement already up-to-date: pip in /Users/sazaracs/anaconda3/lib/python3.7/site-packages (20.2.3)
Requirement already up-to-date: scikit-learn in /Users/sazaracs/anaconda3/lib/python3.7/site-packages (0.23.2)


# __1-Read the Training Data:__

In [2]:
import pandas as pd

# Load the labelled training set; the first row of the file holds the column names.
df = pd.read_csv(
    'training.csv',
    header=0,
    encoding='utf-8',
)

In [3]:
# Preview the first rows of the training frame before doing any processing.
# Reminder: the 'human_tag' column is the target we want to predict.
df.head(n=5)

Unnamed: 0,ID,doc_id,text,date,star_rating,title,human_tag
0,47490,15808037321,"I ordered a sample of the Dietspotlight Burn, ...",6/25/2018 17:51,1,DO NOT BUY!,0
1,16127,16042300811,This coffee tasts terrible as if it got burnt ...,2/8/2018 15:59,2,Coffee not good,0
2,51499,16246716471,I've been buying lightly salted Planters cashe...,3/22/2018 17:53,2,"Poor Quality - Burnt, Shriveled Nuts With Blac...",0
3,36725,14460351031,This product is great in so many ways. It goes...,12/7/2017 8:49,4,"Very lovey product, good sunscreen, but strong...",0
4,49041,15509997211,"My skin did not agree with this product, it wo...",3/21/2018 13:51,1,Not for everyone. Reactions can be harsh.,1


In [4]:
# Count the missing (NaN) entries in each column of the training frame
print(df.isnull().sum())

ID             0
doc_id         0
text           6
date           0
star_rating    0
title          1
human_tag      0
dtype: int64


# __2-Stop Word Removal and Stemming__

In [5]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Stemmer applied to every token that survives filtering in process_text
snow = SnowballStemmer('english')

# Full English stop word list shipped with NLTK
stop = stopwords.words('english')

# Negations (and 'against') carry meaning for this problem, so they must
# stay in the text instead of being dropped as stop words.
excluding = [
    'against', 'not',
    'don', "don't",
    'ain',
    'aren', "aren't",
    'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't",
    'hadn', "hadn't",
    'hasn', "hasn't",
    'haven', "haven't",
    'isn', "isn't",
    'mightn', "mightn't",
    'mustn', "mustn't",
    'needn', "needn't",
    'shouldn', "shouldn't",
    'wasn', "wasn't",
    'weren', "weren't",
    'won', "won't",
    'wouldn', "wouldn't",
]

# Stop words actually removed: the NLTK list minus the protected words above
stop_words = list(filter(lambda candidate: candidate not in excluding, stop))

def process_text(texts):
    """Clean, filter and stem a collection of raw text documents.

    Each entry is lowercased, stripped, has runs of whitespace collapsed to a
    single space and has ``<...>`` markup removed. It is then tokenized with
    ``word_tokenize``; tokens that are numeric, shorter than three characters
    or present in the module-level ``stop_words`` are dropped, and the
    remaining tokens are stemmed with the module-level ``snow`` stemmer.

    Parameters
    ----------
    texts : iterable
        Raw documents. Any entry that is not a ``str`` (for example a NaN
        coming from a missing value) is treated as an empty document.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input entry, in the
        same order as the input.
    """
    # Compile the patterns once per call instead of once per document.
    # Raw strings avoid the invalid '\s' escape sequence of a plain literal.
    whitespace_re = re.compile(r'\s+')
    html_tag_re = re.compile(r'<.*?>')

    # A set gives constant-time membership tests for every token.
    stop_set = frozenset(stop_words)

    final_text_list = []
    for sent in texts:

        # Missing values arrive as non-string objects: treat them as empty text
        if not isinstance(sent, str):
            sent = ""

        sent = sent.lower().strip()           # Lowercase, trim leading/trailing whitespace
        sent = whitespace_re.sub(' ', sent)   # Collapse extra spaces, tabs and newlines
        sent = html_tag_re.sub('', sent)      # Remove HTML tags/markup

        # Keep tokens that are not numeric, longer than 2 characters and
        # not stop words; stem the ones that are kept.
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if not w.isnumeric() and len(w) > 2 and w not in stop_set
        ]

        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list

# __3-Splitting the training dataset into training and validation__
* Features: title, text, star_rating
* Target: human_tag

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation; the fixed seed keeps the
# shuffled split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df.loc[:, ["title", "text", "star_rating"]],
    df["human_tag"].values,
    shuffle=True,
    random_state=241,
    test_size=0.1,
)

__Let's process title and text fields__

In [7]:
# X_train / X_val were produced by indexing into another frame, so assigning
# columns on them in place triggers pandas' SettingWithCopyWarning.
# .assign() returns a new frame with the cleaned columns in the same positions.
print("Pre-process training dataset")
X_train = X_train.assign(
    title=process_text(X_train["title"].values),
    text=process_text(X_train["text"].values),
)

print("Pre-process validation dataset")
X_val = X_val.assign(
    title=process_text(X_val["title"].values),
    text=process_text(X_val["text"].values),
)

Pre-process training dataset
Pre-process validation dataset


# __4-Pipeline and ColumnTransformer__

In [8]:
# Grab model features/inputs and target/output
numerical_features = ['star_rating']

text_features = ['title',
                 'text']

model_features = numerical_features + text_features
model_target = 'human_tag'

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

### COLUMN_TRANSFORMER ###
##########################

# Preprocess the numerical features
numerical_processor = Pipeline([
    ('num_scaler', MinMaxScaler())
])

# Preprocess 1st text feature: binary bag-of-words capped at 300 vocabulary terms
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=300))
])

# Preprocess 2nd text feature (larger vocabulary)
text_processor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=1000))
])
# Backward-compatible alias for the original, misspelled variable name
text_precessor_1 = text_processor_1

# Combine all data preprocessors from above (add more, if you choose to define more!)
# For each processor/step specify: a name, the actual process, and finally the features to be processed.
# Each text column is passed as a single string (not a list) so that the
# vectorizer receives a 1-D sequence of documents.
data_preprocessor = ColumnTransformer([
    ('numerical_pre', numerical_processor, numerical_features),
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_processor_1, text_features[1])
])

### PIPELINE ###
################

# Chain all the desired data transformers, with an estimator at the end.
# Later you can set/reach the parameters using the names issued - for hyperparameter tuning, for example
pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('logistic_regression', LogisticRegression(class_weight='balanced', max_iter=250))
])

# Visualize the pipeline
# This will come in handy especially when building more complex pipelines, stringing together multiple preprocessing steps
from sklearn import set_config
set_config(display='diagram')
pipeline

# __5-Training and validation:__
* Train using X_train and y_train
* Use validation data (X_val and y_val) to see how well it works.

In [10]:
# Train every preprocessing step and the final classifier on the training split
pipeline.fit(X_train, y=y_train)

In [11]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Score the held-out validation split with the fitted pipeline
val_predictions = pipeline.predict(X_val)

# Confusion matrix first, then the per-class precision/recall/F1 report
print(
    confusion_matrix(y_val, val_predictions),
    classification_report(y_val, val_predictions),
    sep="\n",
)
print("Accuracy (validation):", accuracy_score(y_val, val_predictions))

[[4157 1181]
 [ 183  793]]
              precision    recall  f1-score   support

           0       0.96      0.78      0.86      5338
           1       0.40      0.81      0.54       976

    accuracy                           0.78      6314
   macro avg       0.68      0.80      0.70      6314
weighted avg       0.87      0.78      0.81      6314

Accuracy (validation): 0.7839721254355401


# __6-Getting predictions on test data and saving results__:
* Read the test data, use the same process_text function on it
* Pass the data into your pipeline and make predictions

__Reading and imputing:__

In [12]:
# Load the test set. It has no 'human_tag' column: that label is what we predict.
test_df = pd.read_csv(
    'test.csv',
    header=0,
    encoding='utf-8',
)
test_df.head(n=5)

Unnamed: 0,ID,doc_id,text,date,star_rating,title
0,62199,15449606311,"Quality of material is great, however, the bac...",3/7/2018 19:47,3,great backpack with strange fit
1,76123,15307152511,The product was okay but wasn't refined campho...,43135.875,2,Not refined
2,78742,12762748321,I normally read the reviews before buying some...,42997.37708,1,"Doesnt work, wouldnt recommend"
3,64010,15936405041,These pads are completely worthless. The light...,43313.25417,1,The lighter colored side of the pads smells li...
4,17058,13596875291,The saw works great but the blade oiler does n...,12/5/2017 20:17,2,The saw works great but the blade oiler does n...


In [13]:
print("Pre-process test dataset")
# Clean the same two text columns that were cleaned for training/validation
for text_column in ("title", "text"):
    test_df[text_column] = process_text(test_df[text_column].values)

Pre-process test dataset


In [14]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predict labels for the test rows, passing the same three input columns
# (in the same order) that the pipeline was fitted on
test_predictions = pipeline.predict(
    test_df.loc[:, ["title", "text", "star_rating"]]
)

Save the result. Don't forget to submit to the leaderboard!

In [15]:
import pandas as pd

# Pair every test ID with its predicted label
result_df = pd.DataFrame({
    "ID": test_df["ID"],
    "human_tag": test_predictions,
})

# Write the submission file without the row index
result_df.to_csv("project_result.csv", index=False, encoding='utf-8')