In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# Fetch the NLTK resources used later in the notebook: WordNet (lemmatiser),
# the Punkt tokenizer models (word_tokenize) and the stop-word lists.
# "punkt_tab" is the resource newer NLTK releases look up for word_tokenize;
# "punkt" is kept so older releases keep working. nltk.download() reports a
# failed download instead of raising, so requesting both is harmless.
for _resource in ("wordnet", "punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource)

# Location of the comma-separated comments dataset.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_PATH = "C:/ML/python/data/iran.csv"

df = pd.read_csv(DATA_PATH, sep=',')
df.head(10)

In [None]:

# Inspect the frame, then remove incomplete and duplicated rows.
# Only the last expression of a notebook cell is displayed, so the
# intermediate diagnostics are printed explicitly.
df.info()
print("Missing values per column:")
print(df.isna().sum())
df = df.dropna()
# Counted after the NaN rows are gone, i.e. on the rows that would otherwise be kept.
print("Duplicated rows:", df.duplicated().sum())
df = df.drop_duplicates()

# Share of each class in the 'label' column after cleaning.
df['label'].value_counts().plot(kind='pie',autopct='%1.1f%%')

In [None]:
# Lower-case every comment before the remaining cleaning steps run.
df['Comments'] = df['Comments'].str.lower()
import re

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each span runs from a ``<`` to the nearest
    following ``>`` on the same line.
    """
    return re.sub(r'<.*?>', "", text)


# Strip markup from every comment, then preview the first ten cleaned rows.
comments_without_html = df['Comments'].apply(remove_html_tags)
df['Comments'] = comments_without_html
df['Comments'].head(10)

In [None]:
import string

# Characters treated as punctuation: the ASCII set from the string module.
PUNC = string.punctuation

def remove_punctuations(text):
    """Return ``text`` with every character listed in ``PUNC`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNC))
    return text.translate(deletion_table)

# Drop punctuation from every comment.
df['Comments'] = df['Comments'].apply(remove_punctuations)

# Drop every digit. regex=True must be explicit: when the pattern is treated
# as a literal string, the two characters "\d" are searched for and no digit
# is ever removed.
df['Comments'] = df['Comments'].str.replace(r'\d', '', regex=True)

In [None]:
# English stop words as a set for fast membership tests.
sw = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    """Tokenise ``text`` and re-join, with single spaces, the tokens whose
    lower-cased form is not in ``sw``."""
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in sw:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Remove stop words from every comment, then preview the first ten rows.
comments_without_stopwords = df['Comments'].apply(remove_stopwords)
df['Comments'] = comments_without_stopwords
df['Comments'].head(10)

In [None]:
# Single lemmatiser instance shared by every call below.
lemma = WordNetLemmatizer()

def lemmatization(text):
    """Tokenise ``text``, lemmatise each token with ``lemma`` (default
    arguments) and re-join the results with single spaces."""
    return " ".join(map(lemma.lemmatize, word_tokenize(text)))


# Lemmatise every comment, then preview the first ten rows.
lemmatized_comments = df['Comments'].apply(lemmatization)
df['Comments'] = lemmatized_comments
df['Comments'].head(10)

In [None]:
# Concatenate all cleaned comments into one space-separated corpus string.
text = " ".join(df['Comments'])

from wordcloud import WordCloud

# Build the cloud from single words only (collocations disabled) and draw it
# without axes.
wc = WordCloud(colormap="Set2", collocations=False)
wc = wc.generate(text)
plt.imshow(wc, interpolation="gaussian")
plt.axis("off")
plt.show()


In [None]:
from textblob import TextBlob

# Wrap the whole corpus string so its word list can be counted.
blob = TextBlob(text)

from nltk.probability import FreqDist

# Frequency table over the corpus words; keep the 50 most frequent
# (word, count) pairs.
most_frequent_words = FreqDist(blob.words)
top_50_words = most_frequent_words.most_common(50)
# The list holds words, not comments, so the label says so.
print("top 50 most common words: ",top_50_words)

In [None]:
def polarity(text):
    """Return the ``polarity`` attribute TextBlob computes for ``text``."""
    comment_blob = TextBlob(text)
    return comment_blob.polarity



# Score every cleaned comment and store the result in a new column.
polarity_scores = df['Comments'].apply(polarity)
df['polarity'] = polarity_scores



def sentiment(label):
    """Map a numeric polarity score to a sentiment class name.

    Parameters
    ----------
    label : float
        Polarity score; only its sign is used.

    Returns
    -------
    str
        "Negative" when the score is below zero, "Positive" when it is above
        zero, and "Neutral" otherwise. A score that compares neither below
        nor above zero (exactly 0, or NaN) is reported as "Neutral", so the
        function always returns a label and never falls through to ``None``.
    """
    if label < 0:
        return "Negative"
    if label > 0:
        return "Positive"
    return "Neutral"


# Bucket each polarity score into a sentiment class.
df['sentiment'] = df['polarity'].apply(sentiment)

# Pie chart of how often each sentiment class occurs.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts.plot.pie(autopct='%1.1f%%')

In [None]:
####Cats > Gats

import seaborn as sns

# Bar chart with the TextBlob-derived sentiment class on x and the dataset's
# own 'label' column on y.
sns.barplot(data=df, x='sentiment', y='label')
plt.title("label vs sentiment of youtube comment")
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

from sklearn.model_selection import train_test_split
X = df['Comments']
y = df['label']

# Split the raw text BEFORE vectorising so that nothing learned from the
# held-out comments (here: the vocabulary) can influence the features the
# models are trained on.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=42)

# Learn the vocabulary from the training comments only and reuse it for the
# test comments. The count matrices are converted to dense arrays before
# being scaled and passed to the models below.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

from sklearn.preprocessing import MinMaxScaler

ms = MinMaxScaler()

# Scaling parameters are fitted on the training split and reused on the test split.
X_train = ms.fit_transform(X_train)
X_test = ms.transform(X_test)



from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# One unfitted estimator of each kind, each built with its default arguments.
lr = LogisticRegression()
GNB, MNB, BNB = GaussianNB(), MultinomialNB(), BernoulliNB()

from sklearn.metrics import accuracy_score,classification_report

def model_(X_train,X_test,y_train,y_test,model):
    """Fit ``model`` on the training split, evaluate it on the test split and
    print a one-line summary.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for evaluation.
    model : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    The predictions ``model.predict(X_test)`` produced after fitting.
    """
    # fit() is expected to hand back the fitted estimator; keep whatever it returns.
    model = model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc, clf_rpt = accuracy_score(y_test, pred), classification_report(y_test, pred)
    print(f'{model.__class__.__name__}, --ACC-- {acc*100:.2f}%; --Classification Report-- {clf_rpt}')
    return pred

# Train and evaluate the four classifiers in turn on the same split,
# keeping each model's test-set predictions.
lr_pred, GNB_pred, MNB_pred, BNB_pred = [
    model_(X_train, X_test, y_train, y_test, estimator)
    for estimator in (lr, GNB, MNB, BNB)
]

In [None]:
# Start again from the cleaned text: the neural model gets token-id sequences
# rather than the count features used above.
X, y = df['Comments'], df['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.15,random_state=1)


from tensorflow.keras.preprocessing.text import Tokenizer
# The tokenizer is fitted on the training comments only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Word -> integer id mapping learned from the training split; print its size.
word_index = tokenizer.word_index
print(len(word_index))

# Replace each comment by its list of token ids.
X_train, X_test = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
)


# Length of the longest training sequence (0 when there are none).
max_length = max((len(sequence) for sequence in X_train), default=0)


print(max_length)

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the labels of both splits.
y_train, y_test = to_categorical(y_train), to_categorical(y_test)



from tensorflow.keras.utils import pad_sequences

# Bring every sequence to max_length, padding at the end ('post').
pad_options = dict(maxlen=max_length, padding='post')
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional,Dropout,SpatialDropout1D

# Bidirectional-LSTM classifier over the padded token-id sequences.
RNN = Sequential()
# +1 because the ids in word_index start at 1; id 0 is the padding value.
RNN.add(Embedding(input_dim=(len(word_index) + 1),output_dim=150,input_length=max_length))
RNN.add(SpatialDropout1D(0.3))
RNN.add(Bidirectional(LSTM(50,dropout=0.1,recurrent_dropout=0.1)))
RNN.add(Dropout(0.2))
RNN.add(Dense(100,activation='relu'))
RNN.add(Dropout(0.1))
# The targets are one-hot rows over two mutually exclusive classes, so the
# output is a softmax distribution trained with categorical cross-entropy.
# Two independent sigmoids with binary cross-entropy would score each output
# unit separately and make 'accuracy' an element-wise binary accuracy.
RNN.add(Dense(2,activation='softmax'))
RNN.summary()
RNN.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# validation_split=0.2 holds out 20% of the training data for validation.
history = RNN.fit(X_train,y_train,batch_size=32,epochs=10,validation_split=0.2)



# Training vs. validation accuracy per epoch.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.xlabel("Epochs")
plt.ylabel("accuracy")
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()



# Training vs. validation loss per epoch.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
#bad

from sklearn.metrics import roc_auc_score
# Score the network's test-set outputs against the one-hot test labels.
pred = RNN.predict(X_test)
roc = roc_auc_score(y_true=y_test, y_score=pred)
print("roc_auc score: {:.2f}".format(roc*100))

#### NOTE: the author judged the result above to be poor; this model needs further work


In [None]:
#################### Trash