# AI APPS SENTIMENT ANALYSIS

In [1]:
! pip install google-play-scraper pandas numpy scikit-learn matplotlib seaborn wordcloud nltk textblob




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
import os
from datetime import datetime
from typing import List, Dict, Tuple, Optional

warnings.filterwarnings('ignore')

# Web Scraping
from google_play_scraper import app, Sort, reviews

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from textblob import TextBlob

# Statistical Analysis
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, ttest_ind, mannwhitneyu

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import (
    mean_squared_error, r2_score, mean_absolute_error,
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation

# Utilities
from tqdm import tqdm
import joblib

# Download NLTK data
for resource in ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']:
    nltk.download(resource, quiet=True)

# Visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
pd.set_option('display.max_columns', 50)

print("ALL LIBRARIES LOADED SUCCESSFULLY")
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

ALL LIBRARIES LOADED SUCCESSFULLY
Analysis Date: 2025-12-18 19:03:06


In [3]:
# ai app configuration
AI_APPS_CONFIG = {
    'gemini': {
        'app_id'    : 'com.google.android.apps.bard',
        'name'      : 'Google Gemini AI',
        'company'   : 'Google',
        'category'  : 'Conversational/multimodal LLM',
        'color'     : '#4285f4'
        },
    'chatgpt': {
        'app_id'    : 'com.openai.chatgpt',
        'name'      : 'ChatGPT',
        'company'   : 'OpenAI',
        'category'  : 'Conversational LLM',
        'color'     : '#000000'
    },
    'claude': {
        'app_id'    : 'com.anthropic.claude',
        'name'      : 'ClaudeAI',
        'company'   : 'Anthropic',
        'category'  : 'Conversational LLM',
        'color'     : '#D97757'
    },
    'grok': {
        'app_id'    : 'ai.x.grok',
        'name'      : 'Grok',
        'company'   : 'xAI Corporation',
        'category'  : 'Conversational/Humorius LLM',
        'color'     : '#333333'
    },
    'Copilot': {
        'app_id'    : 'com.microsoft.copilot',
        'name'      : 'Microsoft Copilot: AI Chat',
        'company'   : 'Microsoft Corporation',
        'category'  : 'Intergrative/Conversational LLM',
        'color'     : '#F25022'
    },
    'perplexity': {
        'app_id'    : 'ai.perplexity.app.android',
        'name'      : 'Perplexity - Ask Anything',
        'company'   :'Perplexity AI, inc.',
        'category'  : 'Search/Answer Enggine LLM',
        'color'     : '#22B8CF'
    },
    'Poe': {
        'app_id'    : 'com.quora.poe',
        'name'      : 'Poe - Fast AI Chat',
        'company'   : 'Quora, Inc',
        'category'  : 'Aggregator/Bot Hosting',
        'color'     : '#582696'
    },
    'Qwen': {
        'app_id'    : 'ai.qwenlm.chat.android',
        'name'      : 'Qwen Char',
        'company'   : 'Alibaba Cloud',
        'category'  : 'Conversational LLM (CN)',
        'color'     : '#4D6BFE'
    },
    'Deepseek': {
        'app_id'    : 'com.deepseek.chat',
        'name'      : 'Deepseek - AI Assistant',
        'company'   : 'Deepseek AI',
        'category'  : 'Conversational/Coding LLM',
        'color'     : '#4D6BFE'
    },
    'Otterai': {
        'app_id'    : 'com.ainote.flow',
        'name'      : 'Otter: AI Meeting Notes',
        'company'   : 'AISense Inc',
        'category'  : 'Transcription/Productivity',
        'color'     : '#353D57'
    },
    'Blaxboxai': {
        'app_id'    : 'com.blackbox.ai',
        'name'      : 'BlackBox AI & Code Char',
        'company'   : 'Blackbox AI',
        'category'  : 'Coding Assistant/Developer',
        'color'     : '#111111'
    },
    'Meta': {
        'app_id'    : 'com.facebook.stella',
        'name'      : 'Meta AI - Vibes & AI Glasses',
        'company'   : 'Meta Platforms, Inc.',
        'category'  : 'Conversational/Intergrated Reality',
        'color'     : '#0064E0'
    },
    'characterai': {
        'app_id'    : 'ai.character.app',
        'name'      : 'Character.ai',
        'company'   : 'Character.ai',
        'category'  : 'Roleplay/Entertainment LLM',
        'color'     : '#007AFF'  
    },
    'pi': {
        'app_id'    : 'ai.inflection.pi',
        'name'      : 'Pi, your personal AI',
        'company'   : 'Inflection AI',
        'category'  : 'Emotional Support/Personal Assistant',
        'color'     : '#F4EBD0'  
    },
    'grammarly': {
        'app_id'    : 'com.grammarly.android.keyboard',
        'name'      : 'Grammarly - AI Writing',
        'company'   : 'Grammarly, Inc.',
        'category'  : 'Writing Assistant/Correction',
        'color'     : '#15C39A'  
    },
    'deepl': {
        'app_id'    : 'com.deepl.mobiletranslator',
        'name'      : 'DeepL Translate',
        'company'   : 'DeepL SE',
        'category'  : 'AI Translation',
        'color'     : '#0F2B46' 
    },
    'socratic': {
        'app_id'    : 'com.google.socratic',
        'name'      : 'Socratic by Google',
        'company'   : 'Google',
        'category'  : 'Education/Homework Helper',
        'color'     : '#FF6D00'  
    },
    'leonardo': {
        'app_id'    : 'ai.leonardo.app',
        'name'      : 'Leonardo.ai - Image Generator',
        'company'   : 'Leonardo.Ai',
        'category'  : 'Image Generation/Art',
        'color'     : '#9B51E0'  
    }
}

# scraping configuration
SCRAPING_CONFIG = {
    'countries': ['us', 'gb', 'in', 'id', 'jp', 'br', 'fr', 'ca', 'au'],
    'reviews_per_app' : 10000,
    'language': 'en'
}

# dir 
for dir_name in ['data', 'models', 'output', 'reports']:
    os.makedirs(dir_name, exist_ok=True)

print(f"ÄPP configure : {len(AI_APPS_CONFIG)}")
print(f"countires : {len(SCRAPING_CONFIG['countries'])}")
print(f"target reviews : {SCRAPING_CONFIG['reviews_per_app']}")

ÄPP configure : 18
countires : 9
target reviews : 10000


In [4]:
# collecting data
class PlayStoreScraper:
    """
    Production-grade Google Play Store scraper.
    
    Features:
    - Multi-country scraping
    - Error handling and retry logic
    - Progress tracking
    - App metadata collection
    """
    def __init__(self, apps_config: Dict, scraping_config: Dict):
        self.apps = apps_config
        self.countries = scraping_config['countries']
        self.reviews_per_app = scraping_config['reviews_per_app']
        self.language = scraping_config['language']
    

    def get_app_info(self, app_id: str, country: str = 'us') -> Optional[Dict]:
        """Fetch metadata"""
        try:
            info = app(app_id, lang=self.language, country=country)
            return {
                'app_id': app_id,
                'title': info.get('title'),
                'rating': info.get('score'),
                'reviews_count': info.get('reviews'),
                'installs': info.get('installs'),
                'developer': info.get('developer'),
                'last_updated': info.get('updated'),
                'version': info.get('version'),
                'size': info.get('size'),
                'content_rating': info.get('contentRating')
            }
        except Exception as e:
            print(f"Error fetching {app_id} : {e}")
            return None

    def scrape_reviews(self, app_key: str, app_config: Dict) -> pd.DataFrame:
        """Scrape reviews for a single app from multiple countries"""
        all_reviews = []
        reviews_per_country = self.reviews_per_app // len(self.countries)

        for country in self.countries:
            try:
                result, _ = reviews(
                    app_config['app_id'],
                    lang=self.language,
                    country=country,
                    sort=Sort.NEWEST,
                    count=reviews_per_country
                )

                for r in result:
                    r['app_key']    = app_key 
                    r['app_name']   = app_config['name']      
                    r['company']    = app_config['company']   
                    r['category']   = app_config['category']
                    r['country']    = country.upper() 

                all_reviews.extend(result)
            except:
                print(f" {country.upper()} : {e}")
            
        return pd.DataFrame(all_reviews)

    def scrape_all(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Scrape all apps and return reviews + app info"""
        print("colleccting data.....")

        all_reviews = []
        app_info_list = []

        for app_key, app_config in tqdm(self.apps.items(), desc="Scraping apps"):
            print(f"\n {app_config['name']}.....")

            # getr app info
            info = self.get_app_info(app_config['app_id'])
            if info:
                info['app_key'] = app_key
                info['company'] = app_config['company']
                app_info_list.append(info)
                print(f" rating : {info['rating']:.2f} | reviews : {info['reviews_count']:,}")
            
            # get review
            df_reviews = self.scrape_reviews(app_key, app_config)
            if len(df_reviews) > 0:
                all_reviews.append(df_reviews)
                print(f"   collected: {len(df_reviews):,} reviews")
        
        df_all_reviews = pd.concat(all_reviews, ignore_index=True) if all_reviews else pd.DataFrame()
        df_app_info = pd.DataFrame(app_info_list)

        print("Collection completed")
        

        return df_all_reviews, df_app_info

print("playstore scraper done")

playstore scraper done


In [5]:
scraper = PlayStoreScraper(AI_APPS_CONFIG, SCRAPING_CONFIG)
df_raw, df_app_info = scraper.scrape_all()

# save raw data
df_raw.to_csv('data/raw_reviews.csv', index=False)
df_app_info.to_csv('data/app_info.csv', index=False)
print("saved to data/raw_reviews.csv")

colleccting data.....


Scraping apps:   0%|          | 0/18 [00:00<?, ?it/s]


 Google Gemini AI.....
 rating : 4.54 | reviews : 103,221


Scraping apps:   6%|▌         | 1/18 [00:13<03:53, 13.73s/it]

   collected: 9,999 reviews

 ChatGPT.....
 rating : 4.75 | reviews : 131,240


Scraping apps:  11%|█         | 2/18 [00:27<03:39, 13.71s/it]

   collected: 9,999 reviews

 ClaudeAI.....
 rating : 4.57 | reviews : 5,224


Scraping apps:  17%|█▋        | 3/18 [00:41<03:27, 13.83s/it]

   collected: 9,999 reviews

 Grok.....
 rating : 4.88 | reviews : 24,173


Scraping apps:  22%|██▏       | 4/18 [00:52<03:00, 12.91s/it]

   collected: 9,999 reviews

 Microsoft Copilot: AI Chat.....
 rating : 4.75 | reviews : 28,195


Scraping apps:  28%|██▊       | 5/18 [01:04<02:40, 12.37s/it]

   collected: 9,999 reviews

 Perplexity - Ask Anything.....
 rating : 4.66 | reviews : 11,078


Scraping apps:  33%|███▎      | 6/18 [01:16<02:27, 12.26s/it]

   collected: 9,999 reviews

 Poe - Fast AI Chat.....
Error fetching com.quora.poe : App not found(404).


Scraping apps:  39%|███▉      | 7/18 [01:18<01:39,  9.08s/it]


 Qwen Char.....
 rating : 4.04 | reviews : 29


Scraping apps:  44%|████▍     | 8/18 [01:28<01:31,  9.13s/it]

   collected: 7,272 reviews

 Deepseek - AI Assistant.....
 rating : 4.15 | reviews : 5,519


Scraping apps:  50%|█████     | 9/18 [01:40<01:32, 10.26s/it]

   collected: 9,999 reviews

 Otter: AI Meeting Notes.....
Error fetching com.ainote.flow : App not found(404).


Scraping apps:  56%|█████▌    | 10/18 [01:43<01:02,  7.86s/it]


 BlackBox AI & Code Char.....
Error fetching com.blackbox.ai : App not found(404).


Scraping apps:  61%|██████    | 11/18 [01:45<00:43,  6.22s/it]


 Meta AI - Vibes & AI Glasses.....
 rating : 4.65 | reviews : 5,251


Scraping apps:  67%|██████▋   | 12/18 [01:56<00:45,  7.56s/it]

   collected: 9,999 reviews

 Character.ai.....
 rating : 3.66 | reviews : 66,378


Scraping apps:  72%|███████▏  | 13/18 [02:09<00:45,  9.07s/it]

   collected: 9,999 reviews

 Pi, your personal AI.....
 rating : 3.72 | reviews : 922


Scraping apps:  78%|███████▊  | 14/18 [02:21<00:39, 10.00s/it]

   collected: 9,999 reviews

 Grammarly - AI Writing.....
 rating : 4.17 | reviews : 25,570


Scraping apps:  83%|████████▎ | 15/18 [02:33<00:31, 10.64s/it]

   collected: 9,999 reviews

 DeepL Translate.....
 rating : 4.66 | reviews : 2,688


Scraping apps:  89%|████████▉ | 16/18 [02:45<00:22, 11.13s/it]

   collected: 9,999 reviews

 Socratic by Google.....
Error fetching com.google.socratic : App not found(404).


Scraping apps:  94%|█████████▍| 17/18 [02:56<00:11, 11.16s/it]

   collected: 9,999 reviews

 Leonardo.ai - Image Generator.....
Error fetching ai.leonardo.app : App not found(404).


Scraping apps: 100%|██████████| 18/18 [02:59<00:00,  9.97s/it]


Collection completed
saved to data/raw_reviews.csv


In [6]:
print(f"shape : {df_raw.shape}")
print(f"\n columns : {list(df_raw.columns)[:10]}...")
print(f"review per app")
print(df_raw['app_name'].value_counts())

shape : (137259, 16)

 columns : ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt']...
review per app
app_name
Google Gemini AI                9999
ChatGPT                         9999
ClaudeAI                        9999
Grok                            9999
Microsoft Copilot: AI Chat      9999
Perplexity - Ask Anything       9999
Deepseek - AI Assistant         9999
Meta AI - Vibes & AI Glasses    9999
Character.ai                    9999
Pi, your personal AI            9999
Grammarly - AI Writing          9999
DeepL Translate                 9999
Socratic by Google              9999
Qwen Char                       7272
Name: count, dtype: int64


In [7]:
# cleaning
class DataCleaner:
    def __init__(self, df:pd.DataFrame):
        self.df = df.copy()
        self.stats = {'initial_rows' : len(df)}
    
    def remove_duplicates(self) -> 'DataCleaner':
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=['reviewId'], keep='first')
        self.stats['duplicates_removed'] = before - len(self.df)
        return self
    
    def handle_missing_values(self)-> 'DataCleaner':
        self.df['content'] = self.df['content'].fillna('')
        self.df['thumbsUpCount'] = self.df['thumbsUpCount'].fillna(0).astype(int)
        self.df['replyContent'] = self.df['replyContent'].fillna('')
        self.stats['missing_content_count'] = (self.df['content'] == '').sum()
        return self
    
    def filter_empty_reviews(self, min_chars : int = 5 ) -> 'DataCleaner':
        before = len(self.df)
        self.df = self.df[self.df['content'].str.len() >= min_chars]
        self.stats['empty_removed'] = before - len(self.df)
        return self

    def standardize_columns(self) -> 'DataCleaner':
        column_map = {
            'reviewId' : 'review_id',
            'userName' : 'user_name',
            'content' : 'review_text',
            'score' : 'rating',
            'thumbsUpCount' : 'thumbs_up',
            'at' : 'review_date',
            'replyContent' : 'developer_reply',
            'repliedAt' : 'reply_date',
            'appVersion' : 'app_version'
        }

        cols_to_keep = [c for c in column_map.keys() if c in self.df.columns]
        cols_to_keep.extend(['app_key', 'app_name', 'company', 'category', 'country'])

        self.df = self.df[[c for c in cols_to_keep if c in self.df.columns]]
        self.df = self.df.rename(columns={k: v for k, v in column_map.items() if k in self.df.columns})

        self.df['review_date'] =pd.to_datetime(self.df['review_date'])
        self.df['rating'] = self.df['rating'].astype(int)

        return self
    
    def validate(self) -> 'DataCleaner':
        """Run data quality validations."""
        validations = {
            'rating_range_valid': self.df['rating'].between(1, 5).all(),
            'no_null_app_names': self.df['app_name'].notna().all(),
            'dates_valid': self.df['review_date'].notna().all()
        }
        self.stats['validations'] = validations
        return self
    
    def get_result(self) -> Tuple[pd.DataFrame, Dict]:
        self.stats['final_rows'] = len(self.df)
        self.stats['row_removed'] = self.stats['initial_rows'] - self.stats['final_rows']
        return self.df, self.stats
    
cleaner = DataCleaner(df_raw)
df, cleaning_stats = (
    cleaner
    .handle_missing_values()
    .filter_empty_reviews(min_chars=5)
    .standardize_columns()
    .validate()
    .get_result()
)

for key, value in cleaning_stats.items():
    print(f"{key}: {value}")
print(f"clean dataset : {len(df):,} reviews")


initial_rows: 137259
missing_content_count: 0
empty_removed: 15579
validations: {'rating_range_valid': True, 'no_null_app_names': True, 'dates_valid': True}
final_rows: 121680
row_removed: 15579
clean dataset : 121,680 reviews


In [14]:
class FE:
    def __init__(self,df : pd.DataFrame):
        self.df = df.copy()
        self.stop_word = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.feature_group = {}

        def add_temporal_features(self) -> 'FE':
            dt = self.df['review_date']
            self.df['year'] = dt.dt.year
            self.df['month'] = dt.dt.month
            self.df['day'] = dt.dt.day
            self.df['dayofweek'] = dt.dt.dayofweek
            self.df['dayofweek_name'] = dt.dt.day_name()
            self.df['hour'] = dt.dt.hour
            self.df['quarter'] = dt.dt.quarter
            self.df['is_weekend'] = dt.dt.dayofweek.isin([5,6]).astype(int)
            self.df['is_month_start'] = dt.dt.is_month_start.astype(int)
            self.df['is_month_end'] = dt.dt.is_month_end.astype(int)
            self.df['days_since_review'] = (datetime.now() - dt).dt.days

            self.feature_group['temporal'] = 11
            return self
        
        def add_text_length_features(self) -> 'FE':
            text = self.df['review_text']

            self.df['char_count'] = text.str.len()
            self.df['word_count'] = text.str.split().str.len()
            self.df['sentence_count'] = text.str.count(r'.!?+').clip(lower=1)
            self.df['avg_word_length'] = self.df['char_count'] / (self.df['word_count'] + 1)
            self.df['avg_sentence_lenght'] = self.df['word_count'] / self.df['sentence_count']
            self.df['unique_word_count'] = text.apply(lambda x: len(set(str(x).lower().split)))
            self.df['unique_word_ratio'] = self.df['unique_word_count'] / (self.df['word_count'] + 1)

            # lenght categories
            self.df['length_category'] = pd.cut(
                self.ddf['word_count'],
                bins=[0, 10, 30, 100, 1000],
                labels=['very_short', 'short', 'medium', 'long']
            )

            self.feature_group['text_length'] = 8
            return self
        
        def add_text_pattern_features(self) -> 'FE':
            text = self.df['review_text']

            self.df['exclamation_count'] = text.str.count('!') 
            self.df['question_count'] =  text.str.count(r'\?')
            self.df['uppercase_count'] = text.apply(lambda x: sum(1 for c in str(x) if c.isupper()))
            self.df['uppercase_ratio'] = self.df['uppercase_count'] / (self.df['char_count'] + 1)
            self.df['digit_count'] =  text.str.count(r'\d')
            self.df['special_chat_count'] = text.str.count(r'[^a-zA-Z0-9\s]')
            self.df['emoji_count'] = text.apply(lambda x: len(re.findall(r'[\U0001F600-\U0001F64F]', str(x))))
            self.df['has_url'] = text.str.contains(r'http[s]?://', regex=True).astype(int)
            self.df['has_email'] = text.str.contains(r'\S+@\S+', regex=True).astype(int)
            self.df['has_mention'] = text.str.contains(r'@\w+', regex=True).astype(int)
            self.df['has_hastag'] = text.str.contains(r'#\w+', regrex=True).astype(int)
            self.df['all_caps_word_count'] = text.apply(lambda x: sum(1 for w in str(x).split()if w.isupper() and len(w) > 1))

            self.feature_groups['text_patterns'] = 12
            return self
        
        # sentiment features
        def add_sentiment_features(self) -> 'FE':
            """Calculate sentiment scores using texblob"""
            def get_sentiment(text):
                try:
                    blob = TextBlob(str(text))
                    return blob.sentiment.polarity, blob.sentiment.subjectivity
                except:
                    return 0.0, 0.0
            
            sentiments = self.df['review_text'].apply(get_sentiment)
            self.df['polarity'] = sentiments.apply(lambda x: x[0])
            self.df['subjectivity'] = sentiments.apply(lambda x: x[1])

            # derived sentiment features
            self.df['polarity_abs'] = self.df['polarity'].abs()
            self.df['is_positive_polarity'] = (self.df['polarity'] > 0).astypes(int)
            self.df['is_negative_polarity'] = (self.df['polarity'] < 0).astypes(int)
            self.df['is_neutral_polarity'] = (self.df['polarity'] == 0).astypes(int)
            self.df['is_subjective'] = (self.df['subjectivity'] >  0.5).astypes(int)
            self.df['polarity_subjectivity_ratio'] = self.df['polarity_abs'] / (self.df['subjectivity'] + 0.01)

            self.features_groups['sentiment'] = 8
            return self
        
        # label features
        def add_label_features(self) -> 'FE':
            """Create target label and derived features"""
            # sentiment from rating
            self.df['sentiment_label'] = self.df['rating'].apply(
                lambda x: 'positive' if x >= 4 else ('negative' if x <= 2 else 'neutral')
            )
            self.df['sentiment_binary'] = (self.df['rating'] >= 4).astype(int)
            self.df['sentiment_ternary'] = self.df['rating'].apply(
                lambda x:2 if x >= 4 else (0 if x <=2 else 1)
            )

            # rating based features
            self.df['is_extreme_rating'] = self.df['rating'].isin([1,5]).astype(int)
            self.df['is_perfect_rating'] = self.df(['rating'] == 5).astype(int)
            self.df['is_perfect_rating'] = self.df(['rating'] == 1).astype(int)

            self.feature_group['labels'] = 6
            return self
        
        # text cleaning
        def add_cleaned_text(self) -> 'FE':
            def clean_text(text):
                text = str(text).lower()
                text = re.sub(r'http\S+|www\S+', '', text) # url
                text = re.sub(r'\S+@\S+', '', text) # email
                text = re.sub(r'[a-zA-Z\s]', '', text) # specia; char
                text = ''.join(text.split()) # nornmalize whitespace
                return text
            
            def remove_stepword(text):
                words = text.split()
                return ' '.join([self.lemmatizer.lemmatize(w) for w in words])
            
            def lemmatize(text):
                words = text.split()
                return ' '.join([self.lemmatizer.lemmatize(w) for w in words])
            
            self.df['clean_text'] = self.df['review_text'].apply(clean_text)
            self.df['text_no_stopwords'] = self.df['clean_text'].apply(remove_stepword)
            self.df['processed_text'] = self.df['text_no_stopwords'].apply(lemmatize)
            self.df['processed_word_count'] = self.df['processed_text'].str.split().str.len()

            self.feature_groups['text_cleaning'] = 4
            return self
        
        # keyword feature 
        def add_keyword_features(self) -> 'FE':
            """Counting sentiment-realted keywords"""
            positive_word = ['love', 'beautifull', 'helpfull', 'amazing',
                             'great', '5/10', 'fantastic', 'useful', 'best',
                             'awesome', 'excellent', 'perfect', 'superb']
            negative_word = ['bitch', 'stupid', 'annoying', 'hate', 'terrible', 'worst',
                              'bad','useless', 'trash', 'garbage', 'disappointing', 'slow', 'crash',
                         'bug', 'broken', 'waste', 'poor', 'annoying', 'frustrating' ]
            ai_words = ['ai', 'gpt', 'llm', 'rag', 'agi', 'model', 'response', 'answer', 'conversation',
                   'accurate', 'smart', 'intelligent', 'understand', 'language']
            
        def count_words(text, word_list):
            text_lower = str(text).lower()
            return sum(1 for w in word_list if w in text_lower )
        
        self.df['positive_word_count'] = self.df['review_text'].apply(lambda x: count_words(x, positive_word))
        self.df['negative_word_count'] = self.df['review_text'].apply(lambda x: count_words(x. negative_word))
        self.df['ai_word_count'] = self.df['review_Text'].apply(lambda x: count_words(x, ai_word) )
        self.df['keyword_sentiment_score'] = self.df['review_Text'] - self.df['negative_word_count']
        self.df['has_positive_words'] = (self.df['positive_Word_count'] > 0).astype(int)
        self.df['has_negative_words'] = (self.df['negative_Word_count'] > 0).astype(int)

        self.feature_group['keywords'] = 6
        return self
        

            

    


            