
# **POS tags for words (With pipeline)**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.chunk import ne_chunk
import string
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Fetch every NLTK resource used in this cell: tokenizer, WordNet,
# stop-word list, POS tagger and the named-entity chunker with its word list.
for resource in (
    'punkt',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

df = pd.read_csv("/content/Twitter_Data2 (1).csv")

column_name = "clean_text"
ct = df[column_name].values.tolist()

# Keep only rows that actually hold text. Missing cells are read as NaN and
# would make the vectorizer fail; pd.notna is an explicit missing-value test
# (the previous `str(x) != 'nan'` comparison let None through).
documents = [x for x in ct if pd.notna(x)]

# Fit TF-IDF on the surviving documents. Row i of df_tfidf corresponds to
# documents[i]; columns are the vocabulary terms learned by the vectorizer.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
terms = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()
df_tfidf = pd.DataFrame(tfidf_scores, columns=terms)


def preprocess_and_pos_tags(text):
    """Clean *text* with the preprocessing pipeline and print its POS analysis.

    Steps, in order: tokenize, drop English stop words (case-insensitive),
    lemmatize, drop purely numeric and purely punctuation tokens, lowercase,
    POS-tag, and run the named-entity chunker on the tagged tokens.

    The function prints the original text, the cleaned tokens, the POS tags,
    the named-entity tree and the verbs / nouns / adverbs / adjectives
    selected by tag prefix. It returns nothing.

    Args:
        text: The raw document string to analyse.
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # A token is punctuation when *every* character is a punctuation mark.
    # Testing `token in string.punctuation` is a substring check, so
    # multi-character tokens such as "..." or "``" were not removed.
    cleaned_tokens = [
        token for token in lemmatized_tokens
        if not token.isdigit() and not all(ch in punctuation for ch in token)
    ]
    lowercase_tokens = [token.lower() for token in cleaned_tokens]

    pos_tags = pos_tag(lowercase_tokens)
    named_entities = ne_chunk(pos_tags)

    # Group words by the leading letters of their tag (VB*, NN*, RB*, JJ*).
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adverbs = [word for word, pos in pos_tags if pos.startswith('RB')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]

    # Print the preprocessed text and everything derived from it.
    print("Original text:", text)
    print("Preprocessed tokens:", lowercase_tokens)
    print("POS tags:", pos_tags)
    print("Named entities:", named_entities)

    print("Verbs:", verbs)
    print("Nouns:", nouns)
    print("Adverbs:", adverbs)
    print("Adjectives:", adjectives)

# Per-document report: POS analysis followed by the document's non-zero
# TF-IDF terms. Row i of df_tfidf belongs to documents[i].
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")

    preprocess_and_pos_tags(doc)
    print("     tfid    ")
    doc_tfidf = df_tfidf.iloc[i]
    doc_tfidf = doc_tfidf[doc_tfidf != 0]  # Remove terms with TF-IDF score of 0
    print(doc_tfidf)
    print()
    print("---------------xxxxxxxxxxxxxxxxxxxxx--------------------")


# What is printed here is the full TF-IDF matrix, so label it as such;
# the cosine similarities are computed and printed just below.
print("overall TF-IDF matrix")
print(df_tfidf)

# Pairwise cosine similarity between all document TF-IDF rows.
similarity_matrix = cosine_similarity(df_tfidf)
print("cosine similarity for all docs")
print(similarity_matrix)
similarity_matrix.shape

# **Without pipeline**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.chunk import ne_chunk
import string
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Fetch every NLTK resource used in this cell: tokenizer, WordNet,
# stop-word list, POS tagger and the named-entity chunker with its word list.
for resource in (
    'punkt',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

df = pd.read_csv("/content/Twitter_Data2 (1).csv")

column_name = "clean_text"
ct = df[column_name].values.tolist()

# Keep only rows that actually hold text. Missing cells are read as NaN and
# would make the vectorizer fail; pd.notna is an explicit missing-value test
# (the previous `str(x) != 'nan'` comparison let None through).
documents = [x for x in ct if pd.notna(x)]

# Fit TF-IDF on the surviving documents. Row i of df_tfidf corresponds to
# documents[i]; columns are the vocabulary terms learned by the vectorizer.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
terms = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()
df_tfidf = pd.DataFrame(tfidf_scores, columns=terms)


def preprocess_and_pos_tags(text):
    """Tokenize *text* as-is (no cleaning pipeline) and print its POS tags.

    No stop-word removal, lemmatization or lowercasing is applied here; the
    raw tokens are tagged directly. The function prints the list of
    (token, tag) pairs and returns nothing.

    Args:
        text: The raw document string to analyse.
    """
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)

    # Print the tagged pairs: the "POS tags:" label used to be followed by
    # the bare token list, so no tags were ever shown. The named-entity
    # chunking that was computed and then discarded has been dropped.
    print("POS tags:", pos_tags)

# Per-document report: POS tags followed by the document's non-zero
# TF-IDF terms. Row i of df_tfidf belongs to documents[i].
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")

    preprocess_and_pos_tags(doc)
    print("     tfid    ")
    doc_tfidf = df_tfidf.iloc[i]
    doc_tfidf = doc_tfidf[doc_tfidf != 0]  # Remove terms with TF-IDF score of 0
    print(doc_tfidf)
    print()
    print("---------------xxxxxxxxxxxxxxxxxxxxx--------------------")

# What is printed here is the full TF-IDF matrix, so label it as such;
# the cosine similarities are computed and printed just below.
print("overall TF-IDF matrix")
print(df_tfidf)

# Pairwise cosine similarity between all document TF-IDF rows.
similarity_matrix = cosine_similarity(df_tfidf)
print("cosine similarity for all docs")
print(similarity_matrix)
similarity_matrix.shape

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Document 1:
POS tags: ['talk', 'all', 'the', 'nonsense', 'and', 'continue', 'all', 'the', 'drama', 'will', 'vote', 'for', 'modi']
     tfid    
all         0.518202
and         0.173166
continue    0.349752
drama       0.349752
for         0.173166
modi        0.108994
nonsense    0.349752
talk        0.316296
the         0.289423
vote        0.183494
will        0.274145
Name: 0, dtype: float64

---------------xxxxxxxxxxxxxxxxxxxxx--------------------
Document 2:
POS tags: ['what', 'did', 'just', 'say', 'vote', 'for', 'modi', 'welcome', 'bjp', 'told', 'you', 'rahul', 'the', 'main', 'campaigner', 'for', 'modi', 'think', 'modi', 'should', 'just', 'relax']
     tfid    
bjp           0.196892
campaigner    0.251193
did           0.210116
for           0.248737
just          0.420232
main          0.251193
modi          0.234840
rahul         0.227165
relax         0.251193
say           0.227165
should        0.227165
the           0.103932
think         0.227165
told          0.251193
v

(50, 50)

# **3C - POSTAGS**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Make sure the tokenizer and tagger models are available locally.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Sample sentence used for the demonstration.
text = "The quick brown fox jumps over the lazy dog"

# Split the sentence into word tokens, then tag each token.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# One "word: TAG" line per token.
for word, tag in pos_tags:
    print(word, tag, sep=": ")

The: DT
quick: JJ
brown: NN
fox: NN
jumps: VBZ
over: IN
the: DT
lazy: JJ
dog: NN


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
