### --------Import the libraries----------

In [1]:
# utilities
import re
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
import nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import string
import warnings 
warnings.filterwarnings("ignore")

### ---------Exploratory data analysis----------

In [2]:
# Read the training CSV; column names are supplied explicitly through `names`.
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
df.head()

In [3]:
# Show 10 randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(10)

In [4]:
# (rows, columns) of the loaded frame.
df.shape

In [5]:
# Per-column dtype and non-null count summary.
df.info()

In [6]:
# dtype of every column.
df.dtypes

In [7]:
# Number of missing values in each column.
df.isna().sum()

In [8]:
# Distinct label values present in `target`.
df.target.unique()

In [9]:
# How many distinct label values `target` has.
df.target.nunique()

In [10]:
# Bar chart of per-column non-null counts for each target value.
# The two tick labels assume exactly two target groups, in sorted order.
target_counts = df.groupby('target').count()
ax = target_counts.plot(
    kind='bar',
    title='Distribution of data',
    legend=False,
)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)


In [11]:
# Row count per target value, drawn with seaborn.
sns.countplot(x='target', data=df)

In [12]:
# Number of non-null `text` entries per target value, largest group first,
# rendered with a colour gradient.
text_count_by_target = df.groupby('target').count()['text']
temp = text_count_by_target.reset_index().sort_values(by='text', ascending=False)
temp.style.background_gradient(cmap='Purples')

In [13]:
from wordcloud import WordCloud,STOPWORDS

In [14]:
# Word cloud built from the tweets whose target is 0.
new_df = df.loc[df['target'] == 0]
words = ' '.join(new_df['text'])

# Drop tokens containing 'http', @-mentions and the bare 'RT' marker.
kept_tokens = []
for token in words.split():
    if 'http' in token or token.startswith('@') or token == 'RT':
        continue
    kept_tokens.append(token)
cleaned_word = " ".join(kept_tokens)

cloud_builder = WordCloud(
    stopwords=STOPWORDS,
    background_color='black',
    width=3000,
    height=2500,
)
wordcloud = cloud_builder.generate(cleaned_word)

plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [15]:
# Word cloud built from the tweets whose target is 4.
new_df = df[df['target'] == 4]
words = ' '.join(new_df['text'])
# Keep a token only if it has no 'http', is not an @-mention and is not 'RT'.
cleaned_word = " ".join(
    filter(
        lambda word: not ('http' in word or word.startswith('@') or word == 'RT'),
        words.split(),
    )
)
wordcloud = WordCloud(
    width=3000,
    height=2500,
    background_color='black',
    stopwords=STOPWORDS,
).generate(cleaned_word)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [16]:
# Histogram of tweet lengths, measured in characters.
tweet_lengths = list(map(len, df.text))
plt.figure(figsize=(10, 5))
sns.histplot(tweet_lengths, bins=50)
plt.title('Sentence Length')
plt.show()

In [17]:
def hashtag_extract(x):
    """Collect the hashtags that appear in each string of ``x``.

    Args:
        x: Iterable of strings (the notebook passes a Series of tweet texts).

    Returns:
        A list with one entry per input string; each entry is the list of
        word-character runs that follow a ``#`` in that string (the ``#``
        itself is not included), in order of appearance.
    """
    hashtag_pattern = r"#(\w+)"
    return [re.findall(hashtag_pattern, tweet) for tweet in x]

In [18]:
# Per-tweet hashtag lists for target == 4 and target == 0 tweets.
HT_regular = hashtag_extract(df['text'][df['target'] == 4])
HT_negative = hashtag_extract(df['text'][df['target'] == 0])

# Flatten the per-tweet lists into one list per class.
# A nested comprehension is linear in the number of hashtags; sum(lists, [])
# re-copies the growing result once per tweet and is quadratic.
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [19]:
# Ten most frequent hashtags among the target == 4 tweets.
a = nltk.FreqDist(HT_regular)
hashtag_columns = {'Hashtag': list(a.keys()), 'Count': list(a.values())}
d = pd.DataFrame(hashtag_columns).nlargest(n=10, columns="Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()

In [20]:
# Ten most frequent hashtags among the target == 0 tweets.
a = nltk.FreqDist(HT_negative)
tags = list(a.keys())
freqs = list(a.values())
d = pd.DataFrame({'Hashtag': tags, 'Count': freqs})
# Keep only the 10 rows with the highest count.
d = d.nlargest(10, "Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=d)
ax.set(ylabel='Count')
plt.show()

### ---------Data Preprocessing------------

In [21]:
# Work on an explicit copy of the two modelling columns so that the column
# assignments in the following cells do not write into a slice of `df`
# (which triggers pandas' SettingWithCopyWarning).
data = df[['text', 'target']].copy()

In [22]:
# Recode the target value 4 as 1.
data['target'] = data['target'].replace({4: 1})

In [23]:
# Distinct target values after the recoding above.
data['target'].unique()

In [24]:
# Split the rows by label.
is_positive = data['target'] == 1
is_negative = data['target'] == 0
data_pos = data[is_positive]
data_neg = data[is_negative]

In [25]:
# Stack the target == 1 rows on top of the target == 0 rows; the result is a
# new frame, so later edits to `dataset` do not touch `data`.
dataset = pd.concat([data_pos, data_neg])

#### Data cleaning

In [26]:
# Lower-case every tweet, then preview the last rows.
dataset['text']=dataset['text'].str.lower()
dataset['text'].tail()

In [27]:
# Hand-written list of lower-case stop words; a later cell turns it into the
# set that cleaning_stopwords() filters against. Contracted forms are spelled
# without apostrophes (e.g. "youre", "shouldve").
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [28]:
contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not","isn't":"is not","haven't":"have not","hasn't":"has not"}
# Kept so expand_contractions() can tell the default mapping from a custom one.
_DEFAULT_CONTRACTIONS = contractions_dict


def _compile_contractions(mapping):
    """Return a regex that matches any key of ``mapping`` literally."""
    # Longest keys first so a short key cannot pre-empt a longer key that
    # starts at the same position; re.escape keeps metacharacters literal.
    keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))


# Regular expression for finding contractions
contractions_re = _compile_contractions(contractions_dict)


def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Args:
        text: String to process. Matching is case-sensitive.
        contractions_dict: Mapping of contraction -> replacement. Defaults to
            the module-level ``contractions_dict``.

    Returns:
        ``text`` with each occurrence of a key replaced by its value.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string everywhere.
        return text
    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern = contractions_re
    else:
        # The precompiled pattern only knows the default keys; a custom
        # mapping needs its own pattern or it would never match.
        pattern = _compile_contractions(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)
# Apply to `dataset`, the frame the features are built from. `dataset` was
# created by pd.concat and is independent of `data`, so expanding contractions
# on `data` never reached the training text, while the scraped-data pipeline
# further down does apply this step.
dataset['text'] = dataset['text'].apply(expand_contractions)

In [29]:
# Drop tokens of one or two characters. This targets `dataset` (the frame the
# features are built from); the original wrote to `data`, which is not read
# again, so the training text skipped a filter that the scraped-data pipeline
# applies.
dataset['text'] = dataset['text'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 2])
)


In [30]:
# NOTE: this rebinds STOPWORDS, a name that an earlier cell imported from
# wordcloud; any cell run after this one sees the custom set instead.
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in STOPWORDS removed.

    ``text`` is passed through ``str()`` first; the remaining tokens are
    re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)
dataset['text'] = dataset['text'].apply(cleaning_stopwords)
dataset['text'].head()

In [31]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` deleted."""
    # Translation table mapping each punctuation code point to None (= delete).
    deletion_table = dict.fromkeys(map(ord, punctuations_list))
    return text.translate(deletion_table)
# Strip punctuation from every tweet, then preview the last rows.
dataset['text'] = dataset['text'].apply(cleaning_punctuations)
dataset['text'].tail()

In [32]:
def cleaning_repeating_char(text):
    """Collapse every run of a repeated character to a single occurrence.

    Args:
        text: String to normalise.

    Returns:
        ``text`` where any character immediately repeated one or more times
        is kept once (e.g. "sooooo" -> "so"). Legitimate double letters are
        collapsed as well ("good" -> "god").
    """
    # `\1` is a backreference to the captured character. Without the backslash
    # the pattern matched "<any char> followed by literal 1s" and replaced it
    # with the literal string "1".
    return re.sub(r'(.)\1+', r'\1', text)
# Run the repeated-character cleanup over every tweet, then preview the last rows.
dataset['text'] = dataset['text'].apply(cleaning_repeating_char)
dataset['text'].tail()

In [33]:
def cleaning_URLs(data):
    """Replace URLs in ``data`` with a single space.

    Args:
        data: String to clean.

    Returns:
        ``data`` where every run starting with ``www.`` or ``http://`` /
        ``https://`` and extending to the next whitespace is replaced by " ".
    """
    # `[^\s]+` consumes up to the next whitespace; the original `[^s]+` stopped
    # at the first letter "s" and ran across spaces. The dot after `www` is
    # escaped so words that merely start with "www" are left alone.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)
# Run URL removal over every tweet, then preview the last rows.
# NOTE(review): punctuation was already stripped from `dataset['text']` in an
# earlier cell, so "://" and "." are gone by the time this runs - confirm
# whether URL removal was meant to happen before punctuation removal.
dataset['text'] = dataset['text'].apply(cleaning_URLs)
dataset['text'].tail()

In [34]:
def cleaning_numbers(data):
    """Return ``data`` with every run of ASCII digits (0-9) removed."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)
# Remove digits from every tweet, then preview the last rows.
dataset['text'] = dataset['text'].apply(cleaning_numbers)
dataset['text'].tail()

In [35]:
import nltk
# Fetch NLTK's 'punkt' resource - presumably what the nltk.word_tokenize
# calls later in the notebook rely on.
nltk.download('punkt')

#### Tokenization

In [36]:
from nltk.tokenize import RegexpTokenizer
# Turn each cleaned tweet into a list of tokens, then preview the first rows.
dataset['text'] = dataset['text'].apply(nltk.word_tokenize)
dataset['text'].head()

#### Stemming

In [37]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    """Stem every token of ``data`` with the Porter stemmer.

    Args:
        data: Iterable of token strings (one tokenised tweet).

    Returns:
        list[str]: the stemmed tokens, in the original order.
    """
    # Return the stemmed list; the original built it and then returned the
    # untouched input, so stemming never took effect.
    return [st.stem(word) for word in data]
dataset['text'] = dataset['text'].apply(stemming_on_text)
dataset['text'].head()

#### Lemmatization

In [38]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    """Lemmatize every token of ``data`` with the WordNet lemmatizer.

    Args:
        data: Iterable of token strings (one tokenised tweet).

    Returns:
        list[str]: the lemmatized tokens, in the original order.
    """
    # Return the lemmatized list; the original computed it and then returned
    # the untouched input, so lemmatization never took effect.
    return [lm.lemmatize(word) for word in data]
dataset['text'] = dataset['text'].apply(lemmatizer_on_text)
dataset['text'].head()

In [39]:
# Re-join each token list into one space-separated string for the vectorizers.
X = dataset['text'].apply(" ".join)
y = dataset['target']

In [40]:
# Hold out 5% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
    test_size=0.05,
)

### --------Bag of Words----------

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: skip terms present in more than 90% of documents or
# in fewer than 2 documents, and cap the vocabulary at 500000 terms.
bow_vectorizer = CountVectorizer(
    max_features=500000,
    min_df=2,
    max_df=0.90,
)
# Learn the vocabulary from the training split only.
bow_vectorizer.fit(X_train)

In [42]:
# Vectorise both splits with the vocabulary fitted on X_train.
bow_X_test, bow_X_train = (
    bow_vectorizer.transform(split) for split in (X_test, X_train)
)

In [43]:
# Size of the fitted vocabulary. `vocabulary_` is used because
# `get_feature_names()` was removed in scikit-learn 1.2.
print('No. of feature_words: ', len(bow_vectorizer.vocabulary_))

### -----TF-IDF------

In [44]:
# TF-IDF over unigrams and bigrams, capped at 500000 features and fitted on
# the training split only.
tf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
tf_vectorizer.fit(X_train)
# `vocabulary_` is used because `get_feature_names()` was removed in
# scikit-learn 1.2.
print('No. of feature_words: ', len(tf_vectorizer.vocabulary_))

In [45]:
%%time
# Vectorise both splits with the TF-IDF vocabulary fitted on X_train.
tf_X_train = tf_vectorizer.transform(X_train)
tf_X_test  = tf_vectorizer.transform(X_test)

### ---------Evaluation function-----------

In [46]:
def model_Evaluate(model, X_test, y_true=None):
    """Print a classification report and draw an annotated confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_true: True labels aligned with ``X_test``. Defaults to the
            notebook-level ``y_test`` so existing two-argument calls keep
            working.

    Returns:
        None. The report is printed and the heatmap is drawn on the current
        matplotlib axes. The 2x2 annotation grid assumes a binary problem.
    """
    if y_true is None:
        y_true = y_test
    # Predict values for Test dataset
    y_pred = model.predict(X_test)
    # Print the evaluation metrics for the dataset.
    print(classification_report(y_true, y_pred))
    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred)
    categories = ['Negative','Positive']
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
    # Name and percentage go on separate lines inside each cell; the original
    # joined them with a literal "n" instead of a newline.
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
    xticklabels = categories, yticklabels = categories)
    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values" , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

### -----Model building--------

#### BernoulliNB

In [47]:
# Bernoulli naive Bayes with default settings; the same object is fitted on
# both feature sets in the cells below.
BNBmodel = BernoulliNB()

In [48]:

# Fit on the bag-of-words features, keep the test predictions, then report.
y_pred_bow = BNBmodel.fit(bow_X_train, y_train).predict(bow_X_test)
model_Evaluate(BNBmodel, bow_X_test)


In [49]:
# Refit the same estimator on the TF-IDF features (this replaces the
# bag-of-words fit), report, and keep the test predictions.
BNBmodel = BNBmodel.fit(tf_X_train, y_train)
model_Evaluate(model=BNBmodel, X_test=tf_X_test)
y_pred_tf = BNBmodel.predict(tf_X_test)

In [50]:
from sklearn.metrics import roc_curve, auc
# ROC needs a continuous score per sample; hard 0/1 predictions collapse the
# curve to a single operating point. BNBmodel has been refitted on the TF-IDF
# features by now, so a fresh estimator is fitted on the bag-of-words features
# to score the same model the bag-of-words evaluation reported on.
bnb_bow_model = BernoulliNB().fit(bow_X_train, y_train)
y_score_bow = bnb_bow_model.predict_proba(bow_X_test)[:, 1]  # P(class 1)
fpr, tpr, thresholds = roc_curve(y_test, y_score_bow)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

#### Linear SVC

In [51]:
# Linear support-vector classifier with default settings; the same object is
# fitted on both feature sets in the cells below.
SVCmodel = LinearSVC()

In [52]:

# Fit on the bag-of-words features, keep the test predictions, then report.
y_pred_bow2 = SVCmodel.fit(bow_X_train, y_train).predict(bow_X_test)
model_Evaluate(SVCmodel, bow_X_test)

In [53]:
# Refit the same estimator on the TF-IDF features (this replaces the
# bag-of-words fit), report, and keep the test predictions.
SVCmodel = SVCmodel.fit(tf_X_train, y_train)
model_Evaluate(model=SVCmodel, X_test=tf_X_test)
y_pred_tf2 = SVCmodel.predict(tf_X_test)

In [54]:
from sklearn.metrics import roc_curve, auc
# ROC needs a continuous score per sample; hard 0/1 predictions collapse the
# curve to a single operating point. LinearSVC has no predict_proba, so the
# signed distance to the hyperplane is used. SVCmodel is the TF-IDF fit here,
# matching y_pred_tf2.
y_score_tf2 = SVCmodel.decision_function(tf_X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score_tf2)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

#### Logistic Regression

In [55]:
# Logistic regression with C=2, up to 1000 solver iterations, using all
# available cores; the same object is fitted on both feature sets below.
LRmodel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)


In [56]:
# Fit on the bag-of-words features, keep the test predictions, then report.
y_pred_bow3 = LRmodel.fit(bow_X_train, y_train).predict(bow_X_test)
model_Evaluate(LRmodel, bow_X_test)

In [57]:
# Refit the same estimator on the TF-IDF features (this replaces the
# bag-of-words fit), report, and keep the test predictions. This TF-IDF fit
# is the one later applied to the scraped data.
LRmodel = LRmodel.fit(tf_X_train, y_train)
model_Evaluate(model=LRmodel, X_test=tf_X_test)
y_pred_tf3 = LRmodel.predict(tf_X_test)

In [58]:
from sklearn.metrics import roc_curve, auc
# ROC needs a continuous score per sample; hard 0/1 predictions collapse the
# curve to a single operating point. LRmodel is the TF-IDF fit here (matching
# y_pred_tf3), so the probability of class 1 is used as the score.
y_score_tf3 = LRmodel.predict_proba(tf_X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_tf3)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

### --------EDA on scraped data---------

In [59]:
# Load the scraped tweets; the cells below read its `text` column.
uk_pal = pd.read_csv("../input/palestine-ukraine/Palestine_Ukraine.csv")

In [60]:
# Preview the first rows of the scraped data.
uk_pal.head()

In [61]:
# (rows, columns) of the scraped data.
uk_pal.shape

#### Clean and Transform the data

In [62]:
# Run the scraped text through the cleaning helpers defined earlier, one step
# at a time and in this exact order, then vectorise it with the fitted TF-IDF
# vectorizer.
text_steps = [
    lambda x: ' '.join([w for w in x.split() if len(w) > 2]),  # drop 1-2 character tokens
    expand_contractions,
    cleaning_stopwords,
    cleaning_punctuations,
    cleaning_repeating_char,
    cleaning_URLs,
    cleaning_numbers,
    nltk.word_tokenize,
    stemming_on_text,
    lemmatizer_on_text,
    " ".join,  # token list back to a single string
]
data_transform = uk_pal['text'].str.lower()
for step in text_steps:
    data_transform = data_transform.apply(step)
# From here on `data_transform` holds the TF-IDF feature matrix, not text.
data_transform = tf_vectorizer.transform(data_transform)

#### Apply the logistic regression classifier

In [63]:
# Label every scraped tweet with LRmodel as last fitted (on the TF-IDF
# features); the predictions go into a new `sentiment` column.
uk_pal['sentiment'] = LRmodel.predict(data_transform)

In [64]:
# Preview the scraped data with the new `sentiment` column.
uk_pal.head()

In [65]:
# Write the labelled frame to the working directory, without the index column.
uk_pal.to_csv('UK_Pal.csv', index=False)