In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import nltk
from wordcloud import WordCloud
from nltk.stem.porter import *
from sklearn.model_selection import train_test_split


import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [2]:
# Read the training file, then drop the metadata and text columns;
# whatever columns remain form the target frame.
xx = pd.read_csv("../input/twitter-sentiment-analysis-tah/train.csv")
train_target = xx.drop(columns=['date', 'flag', 'user', 'text', 'id'])

In [3]:
# Feature frames: the train file without its "target"/"flag" columns and the
# test file without its "flag" column, plus the sample submission as shipped.
train = (
    pd.read_csv("../input/twitter-sentiment-analysis-tah/train.csv")
    .drop(columns=["target", "flag"])
)
test = (
    pd.read_csv("../input/twitter-sentiment-analysis-tah/test.csv")
    .drop(columns="flag")
)
sample_submission = pd.read_csv("../input/twitter-sentiment-analysis-tah/sample_submission.csv")


# Tweet Preprocessing and Cleaning

In [4]:
# Clean copies of the frames so the loaded `train` / `test` data stay unmodified.
df_train, df_test = train.copy(), test.copy()


def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.

    Notes
    -----
    The pattern itself is substituted in a single pass. Feeding each matched
    string back into ``re.sub`` as if it were a regex misinterprets (or fails
    on) matches containing metacharacters such as ``+`` or ``(``, and lets a
    short match strip the prefix of a longer one (``"@ab @abc"`` -> ``" c"``).
    For patterns with capture groups the whole match is removed.
    """
    return re.sub(pattern, '', input_txt)

def remove_URL(text):
    """Delete ``http://`` / ``https://`` links and ``www.``-prefixed links from ``text``.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


# Removing (@user)
# Raw-string pattern: "\w" inside a plain string literal is an invalid escape sequence.
df_train['text'] = np.vectorize(remove_pattern)(df_train['text'], r"@[\w]*")
df_test['text'] = np.vectorize(remove_pattern)(df_test['text'], r"@[\w]*")


#Remove URL 

df_train['text'] = df_train['text'].apply(remove_URL)
df_test['text'] = df_test['text'].apply(remove_URL)

# Removing Punctuations, Numbers, and Special Characters
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave the text unchanged.
df_train['text'] = df_train['text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
df_test['text'] = df_test['text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)


#Removing Short Words 
# Keep only whitespace-separated tokens longer than three characters.
df_train['text'] = df_train['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
df_test['text'] = df_test['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Cleaned vs. original training text, paired as the two levels of the index
# of an otherwise empty frame, for a quick visual comparison.
dummy=pd.DataFrame(index=[df_train['text'],train["text"]])
dummy.head(5)

In [5]:
# Tokenization: split every cleaned tweet on whitespace into a list of tokens.
tokenized_text = df_train['text'].apply(str.split)
tokenized_text_test = df_test['text'].apply(str.split)
# Stemming 

# Explicit import instead of a second wildcard import of the same module.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Reduce each token to its Porter stem; the result is one list of stems per tweet.
tokenized_text1 = tokenized_text.apply(lambda tokens: [stemmer.stem(tok) for tok in tokens]) # stemming
tokenized_text2 = tokenized_text_test.apply(lambda tokens: [stemmer.stem(tok) for tok in tokens]) # stemming

df_train['text']=tokenized_text1
df_test['text']=tokenized_text2

# 

In [None]:
# from nltk.stem import LancasterStemmer
# lancaster=LancasterStemmer()
# lancaster_tokenized_text = tokenized_text.apply(lambda x: [lancaster.stem(i) for i in x]) # stemming
# lancaster_tokenized_text_test=tokenized_text_test.apply(lambda x: [lancaster.stem(i) for i in x]) # stemming

# df_train['text']=lancaster_tokenized_text
# df_test['text']=lancaster_tokenized_text_test


In [6]:

# Undo the tokenization: join each tweet's stemmed tokens back into one
# space-separated string.
#
# The joined text is built as a new Series instead of overwriting
# tokenized_text1 / tokenized_text2 element by element. That keeps the cell
# idempotent (re-running it no longer joins the characters of already-joined
# strings), does not depend on the index being 0..n-1, and avoids one Python
# level assignment per row. tokenized_text1 / tokenized_text2 keep holding the
# token lists.
df_train['text'] = tokenized_text1.apply(' '.join)

df_test['text'] = tokenized_text2.apply(' '.join)




In [None]:
# Shapes (row counts) of the cleaned train and test text columns.
df_train['text'].shape,df_test['text'].shape

#  WordCloud
Understanding the **most common words** used in the tweets:

In [7]:
# Merge every training tweet into one corpus string and draw its word cloud.
all_words = ' '.join(df_train['text'])
cloud_builder = WordCloud(
    width=800,
    height=500,
    background_color='white',
    random_state=21,
    max_font_size=110,
)
wordcloud = cloud_builder.generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Hashtag Impact

In [8]:
def hashtag_extract(x):
    """Collect the hashtags of each text in ``x``.

    Returns a list with one entry per input text; each entry is the list of
    hashtag names (the word characters following ``#``) found in that text,
    in order of appearance, and is empty when the text has none.
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in x]

# Flatten the per-tweet hashtag lists into one list.
# A nested comprehension is linear in the number of hashtags; sum(lists, [])
# re-copies the accumulated list for every tweet and is quadratic.
HT_regular = [tag for tags in hashtag_extract(df_train['text']) for tag in tags]



# Train Trends

a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting top 10 most frequent hashtags     
d = d.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
# Title added so the train and test charts can be told apart.
ax.set(ylabel = 'Count', title = 'Top 10 hashtags (train)')
plt.show()

# Test Trends


HT_regular_test = [tag for tags in hashtag_extract(df_test['text']) for tag in tags]

a = nltk.FreqDist(HT_regular_test)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting top 10 most frequent hashtags     
d = d.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count', title = 'Top 10 hashtags (test)')
plt.show()

In [None]:
# Frequency distribution of all hashtags extracted from the training tweets.
nltk.FreqDist(HT_regular)

In [None]:
# Number of times the (stemmed) hashtag 'irememb' occurs in the test tweets.
nltk.FreqDist(HT_regular_test)['irememb']

# Extracting Features

**Techniques**:

1. Bag-of-Words Features
2. TF-IDF
3. Word Embeddings

In [None]:
"""
Bag-of-Words Features (BoW)
"""
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
# Learn the vocabulary from the training tweets only ...
bow_train = bow_vectorizer.fit_transform(df_train['text'])
# ... and encode the test tweets with that same vocabulary. Calling
# fit_transform here would re-learn a different vocabulary, so column j of
# bow_test would no longer mean the same word as column j of bow_train (and
# the number of columns could differ), breaking any model fitted on bow_train.
bow_test = bow_vectorizer.transform(df_test['text'])

In [None]:
# (rows, columns) of the test bag-of-words matrix.
bow_test.shape

In [None]:
from sklearn.cluster import KMeans

# Two clusters; the seed is fixed because KMeans initialisation is random and
# the submission would otherwise change from run to run.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(bow_train)
# KMeans.predict accepts sparse matrices, so the test matrix is passed as-is;
# converting it with .toarray() only allocated a full dense copy.
pred = kmeans.predict(bow_test)

# NOTE(review): cluster ids are arbitrary labels (0/1); confirm how they map
# onto the competition's target values before submitting.
sub = pd.DataFrame({'id': test["id"], 'target': pred})
sub.to_csv('sub3.csv' , index = False)



In [None]:
from sklearn.cluster import AffinityPropagation

# Hold out 30% of the bag-of-words rows (with their targets) for validation.
x_train, x_val, y_train, y_val = train_test_split(
    bow_train,
    train_target,
    test_size=0.3,
    random_state=42,
)



In [None]:

# Affinity Propagation model with a fixed seed; the number of clusters is not
# set here (the estimator determines it during fitting).
clustering = AffinityPropagation(random_state=45)


In [None]:
# Fit on the training split.
# NOTE(review): scikit-learn documents Affinity Propagation as quadratic in the
# number of samples (time and memory); confirm x_train is small enough.
clustering.fit(x_train)

In [None]:
# f1_score is otherwise only imported in a later cell, so this cell raised
# NameError when the notebook was run top to bottom.
from sklearn.metrics import f1_score

pred=clustering.predict(x_val)
# NOTE(review): predictions are cluster ids, not sentiment labels; if the model
# finds more than two clusters the default binary F1 is not applicable — confirm.
f1_score(y_val, pred)

In [None]:
# from sklearn.cluster import AffinityPropagation
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.cluster import Birch
# from sklearn.cluster import DBSCAN
# from sklearn.cluster import MiniBatchKMeans
# from sklearn.cluster import MeanShift
# from sklearn.cluster import OPTICS
# from sklearn.cluster import SpectralClustering
# from sklearn.mixture import GMM  #Gaussian Mixture Models





# model1=AffinityPropagation(random_state=45).fit(x_train)
# model2=AgglomerativeClustering().fit(x_train)
# model3=Birch(branching_factor=100, threshold=.5).fit(x_train)
# model4=DBSCAN(eps=3, min_samples=2).fit(x_train)
# model5=MiniBatchKMeans(n_clusters=2, random_state=0, batch_size=6).fit(x_train)
# model6=MeanShift(bandwidth=2).fit(x_train)
# model7=OPTICS(min_samples=2).fit(x_train)
# model8=SpectralClustering(n_clusters=2,assign_labels='discretize',random_state=45).fit(x_train)
# model9=GMM(n_components=2).fit(x_train)


# pred1=model1.predict(x_val)
# pred2=model2.predict(x_val)
# pred3=model3.predict(x_val)
# pred4=model4.predict(x_val)
# pred5=model5.predict(x_val)
# pred6=model6.predict(x_val)
# pred7=model7.predict(x_val)
# pred8=model8.predict(x_val)



# print(f1_score(y_val, pred1))  
# print(f1_score(y_val, pred2))    
# print(f1_score(y_val, pred3))    
# print(f1_score(y_val, pred4))    
# print(f1_score(y_val, pred5))    
# print(f1_score(y_val, pred6))    
# print(f1_score(y_val, pred7))    
# print(f1_score(y_val, pred8))    
# print(f1_score(y_val, pred9))    

    

In [None]:
"""
TF-IDF Features

Term frequency–inverse document frequency (TF-IDF)
is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

TF: It is a measure of how frequently a term, t, appears in a document, d
IDF : is a measure of how important a term is

"""

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
# Fit the vocabulary and IDF weights on the training tweets only ...
tfidf = tfidf_vectorizer.fit_transform(df_train['text'])
# ... and reuse them for the test tweets. Re-fitting on the test set would give
# a different vocabulary and different IDF weights, so the test columns would
# not line up with the columns a model was trained on.
tfidf_test = tfidf_vectorizer.transform(df_test['text'])

**Building a model using TF-IDF features**

# Splitting 

In [None]:

# Keep the first 1,279,999 rows of bow_train and the rows of bow_test from
# position 320000 onward.
# NOTE(review): the row offsets are hard-coded; if bow_test has 320000 rows or
# fewer, test_bow is empty. Confirm the intended split against the data sizes.
train_bow = bow_train[:1279999,:]
test_bow = bow_test[320000:,:]

# x_train, x_val, y_train, y_val = train_test_split(train_bow, train_target, random_state=42, test_size=0.3)



In [None]:
# Preview only the first rows as a dense array; densifying the whole sparse
# matrix just to display it allocates rows x columns values for no benefit.
test_bow[:5].toarray()

# Modeling 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score

# Fixed seed: KMeans initialisation is random, so without it the validation
# score differs between runs.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(x_train)

#predictions from kmeans
pred = kmeans.predict(x_val)

# NOTE(review): cluster ids (0/1) are assigned arbitrarily and may be swapped
# relative to the target labels; confirm the mapping before trusting this score.
f1_score(y_val, pred)

In [None]:


# Keep the first 1,279,999 rows of the training TF-IDF matrix and the rows of
# the test TF-IDF matrix from position 320000 onward.
# NOTE(review): the row offsets are hard-coded; confirm them against the data sizes.
train_tfidf = tfidf[:1279999,:]
test_tfidf = tfidf_test[320000:,:]

# x_train, x_val, y_train, y_val = train_test_split(train_tfidf, train_target, random_state=42, test_size=0.3)

# kmeans.fit(x_train)

# #predictions from kmeans
# pred = kmeans.predict(x_val)

# f1_score(y_val, pred) 
# Preview only the first rows instead of densifying the whole sparse matrix.
test_tfidf[:5].toarray()

In [None]:
# Preview only the first rows as a dense array; converting the full sparse
# test matrix to dense just for display is a large, unnecessary allocation.
tfidf_test[:5].toarray()

In [None]:
# Refit the existing KMeans model on the TF-IDF training rows.
kmeans.fit(train_tfidf)
# Predict on the sparse matrix directly; KMeans accepts sparse input, so the
# .toarray() conversion only created a full dense copy of the test set.
pred = kmeans.predict(tfidf_test)


In [None]:
# Pair every test id with its predicted cluster and write the submission file.
sub = pd.DataFrame({'id': test["id"], 'target': pred})
sub.to_csv('sub1.csv', index=False)

In [None]:
# How many test rows were assigned to each predicted value.
sub["target"].value_counts()
