In [None]:
import pandas as pd
import numpy as np
import re
import string

#preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.model_selection import train_test_split 

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
# English stop words collected into a set; preprocess_tweets filters tokens
# against it.
# NOTE(review): assumes the NLTK "stopwords" corpus is already available in
# this environment — TODO confirm (otherwise nltk.download is needed first).
stop_words = set(stopwords.words("english"))

In [None]:
import warnings
# NOTE(review): with no category given, this hides every warning for the rest
# of the session, including any raised by the libraries used further down.
warnings.filterwarnings("ignore")

In [None]:
# Location of the tweets workbook inside the Kaggle input directory.
TWEETS_XLSX = "/kaggle/input/twitter-dataset/Tweets1.xlsx"

# Read the workbook into the working DataFrame.
df = pd.read_excel(TWEETS_XLSX)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
df

In [None]:
# Print the column overview (names, non-null counts, dtypes) of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame's columns.
df.describe()

In [None]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

In [None]:
# Missing values per column before cleaning.
df.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
# (Narrower alternative kept for reference: df.dropna(subset=["text"]).)
df = df.dropna(axis=0, how="any")

In [None]:
# df1 = df[df.isna().any(axis=1)]
# print (df1)

In [None]:
# df["selected_text"] = np.where(df["selected_text"].isnull() , df["text"] , df["selected_text"])

In [None]:
# Missing values per column after the rows with NaNs were dropped.
df.isna().sum()

In [None]:
# print(df.loc[[1157]],df.loc[[446]])

**Text Preprocessing**

In [None]:
# Text that gets preprocessed below: the "selected_text" column cast to the
# pandas string dtype.
# NOTE(review): features are built from "selected_text" rather than the full
# "text" column — confirm this is intended.
Z = df["selected_text"].astype("string")

In [None]:
def preprocess_tweets(tweet):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' signs,
    strip the remaining punctuation, tokenize, drop English stop words,
    Porter-stem, then lemmatize each stem as an adjective.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string when no
        token survives).
    """
    # converting the tweet to lowercase
    tweet = tweet.lower()

    # removing urls; the pattern has no ^/$ anchors, so no regex flags are
    # needed, and "http\S+" already covers "https..." links
    tweet = re.sub(r"http\S+|www\S+", "", tweet)

    # remove @mentions and '#' signs. This has to run BEFORE punctuation is
    # stripped: '@' and '#' are both in string.punctuation, so doing it
    # afterwards can never match and leaves the mentioned usernames behind
    # as ordinary words.
    tweet = re.sub(r"@\w+|#", "", tweet)

    # remove the remaining punctuation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))

    # remove stopwords
    # NOTE(review): punctuation is already gone here, so contractions such as
    # "don't" arrive as "dont" and will not match apostrophised stop words.
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [word for word in tweet_tokens if word not in stop_words]

    # stemming
    ps = PorterStemmer()
    stem_words = [ps.stem(word) for word in filtered_words]

    # lemmatization (each stem is treated as an adjective)
    # NOTE(review): lemmatizing already-stemmed tokens probably changes very
    # little — confirm whether both steps are wanted.
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(word, pos="a") for word in stem_words]

    return " ".join(lemma_words)

# Clean every entry of Z with preprocess_tweets and store the result as a new
# DataFrame column.
df["text_preprocessed"] = Z.apply(preprocess_tweets)

In [None]:
# df["text_preprocessed"]

In [None]:
# print(df["text"][446])
# print(df["text_preprocessed"][446])

In [None]:
# df["sentiment"]= df["sentiment"].apply(lambda x:1 if x=="positive"else x)
# df["sentiment"]= df["sentiment"].apply(lambda x:0 if x=="negative"else x)
# df["sentiment"]= df["sentiment"].apply(lambda x:2 if x=="neutral"else x)

**EDA**

In [None]:
# First ten rows of the cleaned frame, including the text_preprocessed column.
df.head(10)

In [None]:
# Print the dtype of every column in the cleaned frame.
print(df.dtypes)

In [None]:
# Number of rows per sentiment label.
df["sentiment"].value_counts()

In [None]:
# Pie chart of how the rows are distributed over the three sentiment labels.
labels = "Positive", "Negative", "Neutral"
colors = sns.color_palette("bright")[5:8]
explode = (0, 0.1, 0)  # offsets the second ("Negative") wedge

# Row count per class, listed in the same order as `labels`.
sizes = [
    df.sentiment[df["sentiment"] == sentiment].count()
    for sentiment in ("positive", "negative", "neutral")
]

fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.pie(
    sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct="%1.1f%%",
    startangle=200,
    wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'},
    textprops={'size': 'x-large'},
)
ax1.axis("equal")
ax1.set_title("Proportion of Positive , Negative and Neutral Sentiments", size=15)
fig1.savefig("pie_chart.png")
plt.show()

In [None]:
# Top 10 most frequent words across all preprocessed tweets.
# FreqDist counts the items it iterates over. Iterating the column directly
# yields whole tweets, so each tweet is split into its space-separated tokens
# first to get word-level counts.
freq_dist = FreqDist(
    word for tweet in df["text_preprocessed"] for word in tweet.split()
)
print(freq_dist.most_common(10))

In [None]:
# Merge every preprocessed tweet into one space-separated corpus string.
all_words = " ".join(df["text_preprocessed"])

# Build the word cloud for the whole corpus.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words)

# Draw the cloud without axes, save it to disk, then display it.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("all_words.png")
plt.show()


In [None]:
# Top 10 most frequent words in positive tweets.
# Each tweet is split into tokens before counting; feeding the column to
# FreqDist directly would count whole tweets instead of words.
positive_tweets = df["text_preprocessed"][df["sentiment"] == "positive"]
freq_dist_pos = FreqDist(
    word for tweet in positive_tweets for word in tweet.split()
)
print(freq_dist_pos.most_common(10))

In [None]:
# Merge the preprocessed positive tweets into one space-separated string.
all_words_positive = " ".join(
    df["text_preprocessed"][df["sentiment"] == "positive"]
)

# Build the word cloud for the positive class.
wordcloud_positive = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words_positive)

# Draw the cloud without axes, save it to disk, then display it.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_positive, interpolation="bilinear")
plt.axis("off")
plt.savefig("Pos_words.png")
plt.show()


In [None]:
# Top 10 most frequent words in negative tweets.
# Each tweet is split into tokens before counting; feeding the column to
# FreqDist directly would count whole tweets instead of words.
negative_tweets = df["text_preprocessed"][df["sentiment"] == "negative"]
freq_dist_negative = FreqDist(
    word for tweet in negative_tweets for word in tweet.split()
)
print(freq_dist_negative.most_common(10))

In [None]:
# Merge the preprocessed negative tweets into one space-separated string.
all_words_negative = " ".join(
    df["text_preprocessed"][df["sentiment"] == "negative"]
)

# Build the word cloud for the negative class.
wordcloud_negative = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words_negative)

# Draw the cloud without axes, save it to disk, then display it.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_negative, interpolation="bilinear")
plt.axis("off")
plt.savefig("neg_words.png")
plt.show()

In [None]:
# Top 10 most frequent words in neutral tweets.
# Each tweet is split into tokens before counting; feeding the column to
# FreqDist directly would count whole tweets instead of words.
neutral_tweets = df["text_preprocessed"][df["sentiment"] == "neutral"]
freq_dist_neutral = FreqDist(
    word for tweet in neutral_tweets for word in tweet.split()
)
print(freq_dist_neutral.most_common(10))

In [None]:
# Merge the preprocessed neutral tweets into one space-separated string.
all_words_neutral = " ".join(
    df["text_preprocessed"][df["sentiment"] == "neutral"]
)

# Build the word cloud for the neutral class.
wordcloud_neutral = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words_neutral)

# Draw the cloud without axes, save it to disk, then display it.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_neutral, interpolation="bilinear")
plt.axis("off")
plt.savefig("neutral_words.png")
plt.show()

**Feature Extraction**

In [None]:
# Model input (cleaned tweet text) and target (sentiment label).
x, y = df["text_preprocessed"], df["sentiment"]

In [None]:
# Preview the first ten inputs, followed by their labels.
print(x.head(10), y.head(10), sep="\n")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of each part of the train/test split.
for caption, split in (
    ("X_train: ", X_train),
    ("Y_train: ", Y_train),
    ("X_test: ", X_test),
    ("Y_test: ", Y_test),
):
    print(caption, split.shape)

In information retrieval, TF-IDF, or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The TF-IDF value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general.

TF-IDF is one of the most popular term-weighting schemes today. A survey conducted in 2015 showed that 83% of text-based recommender systems in digital libraries use TF-IDF.

**TF-IDF Vectorizer**

In [None]:
# Word-level, unigram TF-IDF features.
# (Alternative kept for reference: CountVectorizer(ngram_range = (1,2)).)
cv = TfidfVectorizer(
    analyzer="word",
    lowercase=True,
    ngram_range=(1, 1),
)

# Learn the vocabulary and IDF weights from the training tweets only, then
# encode the test tweets with that same fitted vectorizer.
X_train_vec = cv.fit_transform(X_train)
X_test_vec = cv.transform(X_test)

**LOGISTIC REGRESSION**

In [None]:
# Fit a logistic-regression classifier, with all constructor defaults, on the
# vectorized training tweets.
# NOTE(review): warnings are globally silenced earlier in this notebook, so a
# convergence warning from this fit would not be shown — consider checking
# convergence / setting max_iter explicitly.
lr = LogisticRegression()
lr.fit(X_train_vec,Y_train)

**Accuracy score for LR**

In [None]:
# Score the fitted logistic-regression model on the held-out test set.
lr_score = lr.score(X_test_vec, Y_test)
# The features in this notebook come from TfidfVectorizer (the CountVectorizer
# line is commented out), so the label names the vectorizer actually used.
print("Results for Logistic Regression with TfidfVectorizer")
print(lr_score)

In [None]:
# Predict the sentiment labels of the vectorized test tweets with the
# logistic-regression model.
y_pred_lr = lr.predict(X_test_vec)

In [None]:
#create confusion matrix
# lr_matrix = metrics.confusion_matrix(Y_test, y_pred_lr)
# index = ["negative","positive","neutral"]
# lr_matrix = pd.DataFrame(lr_matrix,columns = index,index = index)
# print("\t\tPredicted\nActual")
# print(lr_matrix)

In [None]:
# Cross-tabulate actual labels (rows) against logistic-regression
# predictions (columns).
lr_confusion_matrix = pd.crosstab(
    index=Y_test,
    columns=y_pred_lr,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(lr_confusion_matrix)

**KNN**

In [None]:
# Fit a k-nearest-neighbours classifier (k = 3) on the vectorized training
# tweets.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_vec , Y_train)

**Accuracy score for KNN**

In [None]:
# Score the fitted KNN model on the held-out test set and print the result.
knn_score = knn.score(X_test_vec , Y_test)
print(knn_score)

In [None]:
# Predict the sentiment labels of the vectorized test tweets with KNN.
y_pred_knn = knn.predict(X_test_vec)

In [None]:
# Cross-tabulate actual labels (rows) against KNN predictions (columns).
knn_confusion_matrix = pd.crosstab(
    index=Y_test,
    columns=y_pred_knn,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(knn_confusion_matrix)

**DECISION TREE CLASSIFIER**

In [None]:
# Fit a decision-tree classifier on the vectorized training tweets.
# random_state is pinned (same seed as the train/test split) because tree
# construction involves random tie-breaking between candidate splits; without
# it the score and confusion matrix can differ from one run to the next.
Dtc = DecisionTreeClassifier(random_state = 42)
Dtc.fit(X_train_vec , Y_train)

**Accuracy Score For Dtc**

In [None]:
# Score the fitted decision tree on the held-out test set and print the result.
Dtc_score = Dtc.score(X_test_vec,Y_test)
print(Dtc_score)

In [None]:
# Predict the sentiment labels of the vectorized test tweets with the
# decision tree.
y_pred_Dtc = Dtc.predict(X_test_vec)

In [None]:
# Cross-tabulate actual labels (rows) against decision-tree predictions
# (columns).
Dtc_confusion_matrix = pd.crosstab(
    index=Y_test,
    columns=y_pred_Dtc,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(Dtc_confusion_matrix)