In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import multiprocessing
import os
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from nltk.stem import PorterStemmer

# STOPWORDS = set(stopwords.words('english'))
# Widen the column display so long text fields are readable in DataFrame previews.
pd.set_option("display.max_colwidth", 400)
# NOTE: the module-level `multiprocessing.Pool()` that used to be created here was
# never used (clean_text() builds its own pool) and was never closed, so it only
# leaked idle worker processes for the lifetime of the kernel. It is intentionally gone.

def lemmatize_sentence(sentence):
    """Tokenize ``sentence``, drop punctuation tokens, and lemmatize the rest.

    Args:
        sentence: Input string to process.

    Returns:
        A single string of the WordNet-lemmatized tokens joined by spaces.
    """
    # Single-character punctuation tokens to discard. A set gives exact-token
    # matching (the previous `word in "?:!.,;"` was a substring test).
    punctuations = set("?:!.,;")
    wordnet_lemmatizer = WordNetLemmatizer()

    # Build a filtered list instead of calling list.remove() while iterating:
    # removing during iteration skips the element after each removal, so runs
    # of punctuation such as "!!" were only partially removed.
    kept_words = [
        word for word in nltk.word_tokenize(sentence) if word not in punctuations
    ]
    return " ".join(wordnet_lemmatizer.lemmatize(word) for word in kept_words)

def stem_sentence(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem.

    The sentence is split with ``nltk.word_tokenize`` and the stemmed tokens
    are joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(sentence)
    return " ".join([stemmer.stem(token) for token in tokens])



def remove_unwanted_text(text):
    """Normalize one raw message into lower-cased, punctuation-free text.

    Steps, in order: drop a leading "Re:" reply prefix, turn newlines into
    ". ", strip @mentions / "RT" markers, lower-case, strip HTML tags, strip
    links, and finally replace every run of characters that are neither word
    characters nor whitespace with a single space.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string.
    """
    reply_prefix = "Re:"
    if text.startswith("Re: "):
        # Strip only the leading prefix. The previous str.replace() call removed
        # every "Re:" occurring anywhere in the message.
        text = text[len(reply_prefix):]
    new_text = text.replace("\n", ". ")
    # Mentions are removed before lower-casing because the "RT" marker is
    # matched case-sensitively.
    new_text = remove_mentions(new_text)
    new_text = new_text.lower()
    new_text = remove_html(new_text)
    new_text = remove_link(new_text)
    new_text = re.sub(r'[^\w\s]+', ' ', new_text)
    return new_text

def remove_link(text):
    """Replace URLs in ``text`` with a single space.

    Two forms are removed: anything starting with "http" up to the next
    whitespace, and anything starting with a literal "www." up to the next
    whitespace.

    Args:
        text: Input string.

    Returns:
        The string with each matched link replaced by one space.
    """
    new_text = re.sub(r"http\S+", " ", text)
    # The dot is escaped so that only a literal "www." starts a link. Unescaped,
    # it matched any character and removed ordinary words like "awwwww".
    new_text = re.sub(r"www\.\S+", " ", new_text)
    return new_text

def remove_mentions(text):
    """Replace @mentions and retweet markers in ``text`` with a single space.

    Args:
        text: Input string (matching of "RT" is case-sensitive).

    Returns:
        The string with every "@..." token and every standalone "RT" followed
        by whitespace replaced by one space.
    """
    # NOTE(review): "@\S+" also removes the domain part of e-mail addresses
    # ("a@b.com" -> "a "); confirm that is acceptable for this dataset.
    new_text = re.sub(r"@\S+", " ", text)
    # \b anchors "RT" to a word start so words that merely end in "RT"
    # (e.g. "START now") are no longer truncated.
    new_text = re.sub(r'\bRT\s+', " ", new_text)
    return new_text

def remove_html(text):
    """Swap every ``<...>`` tag (non-empty, no nested angle brackets) for a space."""
    tag_pattern = re.compile(r"<[^<>]+>")
    return tag_pattern.sub(" ", text)

def clean_text(data):
    """Add a ``clean_text`` column holding the normalized form of ``data['text']``.

    Each value of the ``text`` column is passed through
    ``remove_unwanted_text`` in a pool of 4 worker processes.

    Args:
        data: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same ``data`` object, now with the ``clean_text`` column set.
    """
    # The context manager shuts the workers down once map() has returned;
    # previously each call left 4 worker processes alive.
    with multiprocessing.Pool(processes=4) as pool:
        data['clean_text'] = pool.map(remove_unwanted_text, data['text'])
    return data


In [None]:
# Read the two CSV splits: the "*_50k" file is used as training data and the
# "*_10k" file as test data throughout the rest of the notebook.
sentiment_train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/generic-sentiment-multidomain-sentiment-dataset/generic_sentiment_dataset_50k.csv"
)
sentiment_test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/generic-sentiment-multidomain-sentiment-dataset/generic_sentiment_dataset_10k.csv"
)

In [None]:
# Preview the first rows of the training frame.
sentiment_train_df.head()

In [None]:
# Run the cleaning step on the training frame, then on the test frame;
# clean_text() adds a `clean_text` column and returns the frame it was given.
sentiment_train_df, sentiment_test_df = map(
    clean_text, (sentiment_train_df, sentiment_test_df)
)

In [None]:
# Show 20 random rows to eyeball the cleaning result. The seed is fixed so that
# "Restart & Run All" displays the same rows every time.
sentiment_train_df.sample(20, random_state=42)

In [None]:
# Build features from the cleaned text produced by clean_text(). Previously the
# raw `text` column was selected here, so the `clean_text` column computed
# above was never used by the model.
train_text = sentiment_train_df["clean_text"]
test_text = sentiment_test_df["clean_text"]

In [None]:
# Stack the train and test text Series into one Series (train rows first).
all_text = pd.concat(objs=[train_text, test_text])

In [None]:
# Word-level TF-IDF: unigrams only, English stop words dropped, accents
# stripped, sublinear term-frequency scaling, vocabulary capped at 10,000 terms.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
)

In [None]:
# Fit the vocabulary and IDF weights on the training text only, then apply the
# fitted vectorizer to the test text. Fitting on train+test (as before) leaks
# test-set statistics into the features and makes the test metrics optimistic.
train_word_features = word_vectorizer.fit_transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [None]:
# Character-level TF-IDF over 2- to 6-character n-grams, capped at 50,000
# features. `stop_words` is deliberately not passed: scikit-learn only applies
# it when analyzer == 'word', so with analyzer='char' it was ignored and only
# produced a warning.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)

In [None]:
# Fit the character n-gram vocabulary and IDF weights on the training text only,
# then apply the fitted vectorizer to the test text. Fitting on train+test (as
# before) leaks test-set statistics into the features.
train_char_features = char_vectorizer.fit_transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [None]:
# Concatenate the sparse feature blocks column-wise: character n-gram columns
# first, word columns second, in the same order for train and test.
train_features = hstack((train_char_features, train_word_features))
test_features = hstack((test_char_features, test_word_features))

In [None]:
# Target vector: the `label` column of the training frame.
train_target = sentiment_train_df.loc[:, 'label']

In [None]:
# Sanity check: (number of training rows, number of stacked char + word features).
train_features.shape

In [None]:
# 'sag' is a stochastic solver that shuffles the data, so random_state is pinned
# to make the cross-validation scores reproducible between runs.
classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)
# Mean ROC AUC over 5 folds.
# NOTE(review): scoring='roc_auc' is the binary scorer; if `label` has more than
# two classes, use 'roc_auc_ovr' (or 'roc_auc_ovo') instead. Confirm against the data.
cv_score = np.mean(
    cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc')
)

In [None]:
# Display the mean cross-validated ROC AUC computed in the previous cell.
cv_score

In [None]:
# Fit the final model on the full training set. random_state is pinned because
# the 'sag' solver is stochastic; without it the fitted coefficients (and the
# test metrics computed from them) vary from run to run.
classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)
classifier.fit(train_features, train_target)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# Predict labels for the test split and report micro-averaged F1 and accuracy.
predict_test = classifier.predict(test_features)
# scikit-learn metrics take (y_true, y_pred); the arguments were previously
# swapped. These two metrics happen to be symmetric, but the correct order
# keeps the cell safe if the averaging mode or metric is changed later.
print("f1 score: {}".format(f1_score(sentiment_test_df.label, predict_test, average='micro')))
print("Accuracy score: {}".format(accuracy_score(sentiment_test_df.label, predict_test)))


In [None]:
# (Removed a leftover `multiprocessing.Pool?` IPython help lookup: it was a
# debugging aid, is not valid Python outside IPython, and contributed nothing
# to the analysis.)
