In [2]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from collections import Counter

import textstat
from lexicalrichness import LexicalRichness

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
# Fake:1, Real: 0

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [56]:
from nltk.stem import PorterStemmer

In [4]:
def load_trainset(csv_file):
    df_raw = pd.read_csv(csv_file, sep='\n')
    df_raw.rename(columns = {'text\tlabel':'text'}, inplace = True)
    df_raw = pd.DataFrame(df_raw.text.str.rsplit('\t',1).tolist(),
                          columns = ['text', 'label'])
    return df_raw

In [17]:
df = load_trainset('train.csv')

### Preprocessing text for Vectorizor

In [57]:
"""
1) Lowercase the text: This preprocessing step is done so words can later be cross checked with the stopwords and pos_tag dictionaries.
2) Remove words with just one letter
3) Remove words that contain numbers
4) Tokenize the text and remove punctuation
5) Remove stopwords: Proper analysis of text usually relies on the most recurring words. Stopwords including words as “the”, “as” and “and” appear a lot in a text, but eech word doesn’t really give relevant explanation, so one of NLP comon practices is to remove such words.
6) Remove empty tokens: After tokenization, we have to make sure all tokens taken into account contribute to the label prediction.
7) Lemmatize the text: In order to normalize the text, we apply lemmatization. In this way, words with the same root are processed equally e.g. when the words “took” or “taken” are found in the text, they are lemmatized to “take”, infinitive of the verb.
"""

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def remove_URL(t):
    return re.sub(r"http\S+","", t)

ps = PorterStemmer()

def preprocess(text):

    text = remove_URL(text)
    text = text.lower().replace("-"," ").replace("."," ").replace(":"," ")
    custom_s = string.punctuation.replace("-","”")+"‘"+"“"+"."+","
    for i in custom_s:
        text = text.replace(i,"")
    text = [t for t in text.split(" ") if len(t) > 1] # a list of words
    text = [word for word in text if not any(c.isdigit() for c in word)]    
    text = [x.strip() for x in text if x not in stop_words]
    text = [t for t in text if len(t) > 0] # remove empty token
    text = [ps.stem(x) for x in text]
#     pos_tags = pos_tag(text)
#     text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    text = " ".join(text)
    return text

In [63]:
df.text = df.text.apply(preprocess)

### New Pipeline model

In [68]:
X_train,X_test,y_train,y_test = train_test_split(df.text, 
                                                 df.label, 
                                                 test_size=0.2, 
                                                 random_state=0)

In [70]:
trigram_vectorizer = CountVectorizer(analyzer = "word", ngram_range=(1,3))

classifier = LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [71]:
pipe = Pipeline([('vect', CountVectorizer(analyzer = "word", ngram_range=(1,3))),
                 ('tfidf', TfidfTransformer(norm="l2")),
                 ('model', classifier)
                  ])
# Fitting the model
model = pipe.fit(X_train, y_train)
# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))







accuracy: 81.36%


### Predict the test set

In [72]:
def load_testset(csv_file):
    df_raw = pd.read_csv(csv_file, sep='\n')
    df_raw.rename(columns = {'id\ttext':'text'}, inplace = True)
    df_raw = pd.DataFrame(df_raw.text.str.split('\t',1).tolist(),
                          columns = ['id', 'text'])
    return df_raw

In [73]:
test_df = load_testset('test.csv')

In [74]:
test_df.text = test_df.text.apply(preprocess)

In [75]:
test_df.text[20]

'best song year best track matter rank fader stream appl music spotifi'

In [76]:
test_pred = model.predict(test_df.text)

In [77]:
test_df['label'] = test_pred

In [78]:
test_df.label.value_counts()

0        817
1        429
label      1
Name: label, dtype: int64

In [79]:
sub_test = test_df[['id','label']]

In [80]:
sub_test.to_csv("sub_0811B.csv",index = False)