<a href="https://colab.research.google.com/github/nayanshreepurbia/content_recommendation_system/blob/main/research_paper_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install pandas nltk textblob



In [9]:
pip install python-docx



In [10]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download NLTK resources (stopwords and punkt)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
df = pd.read_csv('/content/demo1122.csv')

In [12]:
df.head(10)

Unnamed: 0,user_id,username,bio
0,1,Plant Whisperer,"Thrives in the company of plants, finds nurtur..."
1,2,Wanderlust,Driven by a yearning to explore new cultures a...
2,3,Technophile,Embraces the latest technological advancements...
3,4,Culinary Artist,"Transforms cooking into an art form, experimen..."
4,5,Bookworm,"Devours books of all genres, finding solace an..."
5,6,Animal Advocate,"Champions the well-being of animals, volunteer..."
6,7,History Buff,"Fascinated by the past, spending countless hou..."
7,8,Musician,"Expresses themself through the power of music,..."
8,9,Fitness Enthusiast,"Maintains an active lifestyle, incorporating r..."
9,10,Movie Critic,"Holds a deep appreciation for cinema, analyzin..."


In [13]:
# Function for sentiment analysis using TextBlob
def calculate_sentiment(bio):
    analysis = TextBlob(bio)
    return analysis.sentiment.polarity

In [14]:
# Tokenization and stemming using NLTK
def tokenize_and_stem(bio):
    tokens = word_tokenize(bio)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

In [15]:
# Apply sentiment analysis, tokenization, and stemming
df['sentiment_score'] = df['bio'].apply(calculate_sentiment)
df['tokenized_and_stemmed'] = df['bio'].apply(tokenize_and_stem)

In [16]:
# Calculate average sentiment score
average_sentiment = df['sentiment_score'].mean()

In [17]:
# Categorize data into 'above_average' and 'below_average'
df['category'] = df['sentiment_score'].apply(lambda x: 'above_average' if x >= average_sentiment else 'below_average')

In [18]:
# Sort the DataFrame based on sentiment scores
df_sorted = df.sort_values(by='sentiment_score')

In [19]:
# Select the top 20% and bottom 20% of the dataset
top_20_percent = df_sorted.tail(int(0.2 * len(df_sorted)))
bottom_20_percent = df_sorted.head(int(0.2 * len(df_sorted)))

In [20]:
import string
import spacy
!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [21]:
# Function for POS tagging
def pos_tagging(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract POS tags
    pos_tags = [(token.text, token.pos_) for token in doc]

    return pos_tags

# Apply POS tagging to the 'text' column
df['pos_tags'] = df['bio'].apply(pos_tagging)

# Display the DataFrame with original text and POS tags
print(df[['bio', 'pos_tags']])

                                                  bio  \
0   Thrives in the company of plants, finds nurtur...   
1   Driven by a yearning to explore new cultures a...   
2   Embraces the latest technological advancements...   
3   Transforms cooking into an art form, experimen...   
4   Devours books of all genres, finding solace an...   
5   Champions the well-being of animals, volunteer...   
6   Fascinated by the past, spending countless hou...   
7   Expresses themself through the power of music,...   
8   Maintains an active lifestyle, incorporating r...   
9   Holds a deep appreciation for cinema, analyzin...   
10  Enjoys getting lost in a good book and explori...   
11  Passionate about all things tech, loves keepin...   
12  Finds peace and joy in spending time outdoors,...   
13  Expresses creativity through music, enjoys pla...   
14  Loves trying new cuisines and experimenting wi...   
15  Expresses themselves through various art forms...   
16  Enjoys the challenge and co

In [22]:
# Display the results

In [23]:
print("Original DataFrame:")
print(df)

Original DataFrame:
    user_id            username  \
0         1     Plant Whisperer   
1         2          Wanderlust   
2         3         Technophile   
3         4     Culinary Artist   
4         5            Bookworm   
5         6     Animal Advocate   
6         7        History Buff   
7         8            Musician   
8         9  Fitness Enthusiast   
9        10        Movie Critic   
10       11            Bookworm   
11       12             TechWiz   
12       13         NatureLover   
13       14            Musician   
14       15              Foodie   
15       16              Artist   
16       17               Gamer   
17       18         BookClubber   
18       19           MovieBuff   
19       20          FitnessFan   
20       21           Stargazer   
21       22           Wordsmith   
22       23         Eco-Warrior   
23       24       Hobbyist Chef   
24       25        Tech Creator   

                                                  bio  sentiment_scor

In [24]:
print("DataFrame Sorted by Sentiment Score:")
print(df_sorted)

DataFrame Sorted by Sentiment Score:
    user_id            username  \
6         7        History Buff   
1         2          Wanderlust   
16       17               Gamer   
13       14            Musician   
20       21           Stargazer   
4         5            Bookworm   
17       18         BookClubber   
18       19           MovieBuff   
15       16              Artist   
9        10        Movie Critic   
14       15              Foodie   
22       23         Eco-Warrior   
7         8            Musician   
3         4     Culinary Artist   
2         3         Technophile   
11       12             TechWiz   
8         9  Fitness Enthusiast   
19       20          FitnessFan   
10       11            Bookworm   
24       25        Tech Creator   
12       13         NatureLover   
21       22           Wordsmith   
5         6     Animal Advocate   
0         1     Plant Whisperer   
23       24       Hobbyist Chef   

                                                  bi

In [25]:
print("Top 20% Above Average:")
print(top_20_percent)

Top 20% Above Average:
    user_id         username  \
12       13      NatureLover   
21       22        Wordsmith   
5         6  Animal Advocate   
0         1  Plant Whisperer   
23       24    Hobbyist Chef   

                                                  bio  sentiment_score  \
12  Finds peace and joy in spending time outdoors,...         0.400000   
21  Captivated by the power of language, weaving w...         0.500000   
5   Champions the well-being of animals, volunteer...         0.500000   
0   Thrives in the company of plants, finds nurtur...         0.500000   
23  Enjoys experimenting in the kitchen, trying ne...         0.500606   

                                tokenized_and_stemmed       category  
12  [find, peac, and, joy, in, spend, time, outdoo...  above_average  
21  [captiv, by, the, power, of, languag, ,, weav,...  above_average  
5   [champion, the, well-b, of, anim, ,, volunt, a...  above_average  
0   [thrive, in, the, compani, of, plant, ,, find,...  

In [26]:
print("Bottom 20% Below Average:")
print(bottom_20_percent)

Bottom 20% Below Average:
    user_id      username                                                bio  \
6         7  History Buff  Fascinated by the past, spending countless hou...   
1         2    Wanderlust  Driven by a yearning to explore new cultures a...   
16       17         Gamer  Enjoys the challenge and competition of video ...   
13       14      Musician  Expresses creativity through music, enjoys pla...   
20       21     Stargazer  Gazes at the night sky with wonder and fascina...   

    sentiment_score                              tokenized_and_stemmed  \
6         -0.050000  [fascin, by, the, past, ,, spend, countless, h...   
1         -0.015152  [driven, by, a, yearn, to, explor, new, cultur...   
16         0.000000  [enjoy, the, challeng, and, competit, of, vide...   
13         0.000000  [express, creativ, through, music, ,, enjoy, p...   
20         0.000000  [gaze, at, the, night, sky, with, wonder, and,...   

         category  
6   below_average  
1   belo

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [28]:
# User input (replace this with the user's actual input)
user_input = "Driven by a yearning to explore new cultures and landscapes."


In [29]:
# Function for POS tagging
def pos_tagging(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract POS tags
    pos_tags = [(token.text, token.pos_) for token in doc]

    return pos_tags

# Apply POS tagging to the 'text' column
df['pos_tags'] = df['bio'].apply(pos_tagging)

# User input (replace this with the user's actual input)
user_input = "competition"

# Preprocess data for vectorization
corpus = df['bio'].astype(str)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Tokenization, stemming, and POS tagging for user input
user_pos_tags = pos_tagging(user_input)
user_vector = vectorizer.transform([' '.join([word for word, pos in user_pos_tags])])

# Calculate cosine similarities
cosine_similarities = linear_kernel(user_vector, tfidf_matrix).flatten()

# Get recommendations based on similarity
recommended_indices = cosine_similarities.argsort()[:-4:-1]  # Get top 3 recommendations
recommendations = df.iloc[recommended_indices]

print("User Input:", user_input)
print("\nRecommended Topics:")
print(recommendations[['username', 'bio']])


User Input: competition

Recommended Topics:
        username                                                bio
16         Gamer  Enjoys the challenge and competition of video ...
24  Tech Creator  Shares their passion for technology through on...
11       TechWiz  Passionate about all things tech, loves keepin...


In [34]:
# Function for counting occurrences of a specific word
def count_word_occurrences(pos_tags, target_word):
    # Count occurrences of the target word
    count = sum(1 for word, _ in pos_tags if word.lower() == target_word.lower())
    return count

# Assuming you have a DataFrame 'df' with a column 'pos_tags'
# Specify the target word
target_word = "in"

# Apply the count_word_occurrences function to the 'pos_tags' column
df['word_occurrences'] = df['pos_tags'].apply(lambda tags: count_word_occurrences(tags, target_word))

# Display the DataFrame with word occurrences
print(df[['bio', 'pos_tags', 'word_occurrences']])


                                                  bio  \
0   Thrives in the company of plants, finds nurtur...   
1   Driven by a yearning to explore new cultures a...   
2   Embraces the latest technological advancements...   
3   Transforms cooking into an art form, experimen...   
4   Devours books of all genres, finding solace an...   
5   Champions the well-being of animals, volunteer...   
6   Fascinated by the past, spending countless hou...   
7   Expresses themself through the power of music,...   
8   Maintains an active lifestyle, incorporating r...   
9   Holds a deep appreciation for cinema, analyzin...   
10  Enjoys getting lost in a good book and explori...   
11  Passionate about all things tech, loves keepin...   
12  Finds peace and joy in spending time outdoors,...   
13  Expresses creativity through music, enjoys pla...   
14  Loves trying new cuisines and experimenting wi...   
15  Expresses themselves through various art forms...   
16  Enjoys the challenge and co