<a href="https://colab.research.google.com/github/mamta85/Git-hub-tutorial/blob/master/Untitled31updated%20tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Step 1: Import required libraries
from google.colab import files
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data (only need to run once)
nltk.download('punkt')
nltk.download('stopwords')

# Step 2: Upload CSV file
uploaded = files.upload()

# Step 3: Read CSV file into DataFrame
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

# Step 4: Check your columns to identify the review text column
print("Columns in dataset:", df.columns.tolist())

# Assuming your review text column is 'text_en' (change if different)
review_col = 'text_en'

# Step 5: Preprocessing function (you can customize this)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', '', text)         # remove non-alphabetic chars
    tokens = word_tokenize(text)                  # tokenize
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]  # remove stopwords and short words
    return " ".join(tokens)

# Step 6: Apply preprocessing on reviews
df['processed_text'] = df[review_col].apply(preprocess_text)

# Step 7: Apply TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=5, max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# Step 8: Aggregate TF-IDF scores across all reviews to find top features
tfidf_sum = np.sum(tfidf_matrix.toarray(), axis=0)
tfidf_scores = pd.DataFrame({'feature': feature_names, 'tfidf_sum': tfidf_sum})
tfidf_scores = tfidf_scores.sort_values(by='tfidf_sum', ascending=False)

print("\nTop 20 important features/words across all reviews:")
print(tfidf_scores.head(20))

# Step 9 (Optional): Function to get top N TF-IDF words per review
def get_top_tfidf_words(row, features, top_n=5):
    row_data = row.toarray().flatten()
    top_indices = row_data.argsort()[-top_n:][::-1]
    top_words = [(features[i], row_data[i]) for i in top_indices]
    return top_words

# Example: print top words for first review
print("\nTop words in first review:")
print(get_top_tfidf_words(tfidf_matrix[0], feature_names, top_n=5))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving cleaned_reviews.csv to cleaned_reviews (1).csv
Columns in dataset: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion', 'text_en', 'sentiment', 'cleaned_review']

Top 20 important features/words across all reviews:
    feature  tfidf_sum
3      good  17.143660
6      nice   9.501174
2  facebook   8.799361
1       app   7.662812
0   account   5.595074
4     great   5.000000
5      like   3.713641
7       one   3.713641

Top words in first review:
[('great', 1.0), ('one', 0.0), ('nice', 0.0), ('like', 0.0), ('good', 0.0)]
