In [1]:
!pip install textblob emoji pandas
!python -m textblob.download_corpora  # For sentiment analysis

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Mohan244643\AppData\Roaming\nltk_data...
[nltk_

In [2]:
# Diagnostic only: list every `python` and `pip` executable found on PATH (Windows `where`).
# More than one hit means a bare `pip`/`python` shell command may not belong to the
# interpreter that runs this kernel.
!where python
!where pip

C:\Users\Mohan244643\AppData\Local\anaconda3\python.exe
C:\Users\Mohan244643\AppData\Local\Programs\Python\Python313\python.exe
C:\Users\Mohan244643\AppData\Local\Microsoft\WindowsApps\python.exe
C:\Users\Mohan244643\AppData\Local\anaconda3\Scripts\pip.exe
C:\Users\Mohan244643\AppData\Local\Programs\Python\Python313\Scripts\pip.exe


In [3]:
from textblob import TextBlob
import emoji
from datetime import datetime, timedelta

def calculate_post_score(post, now=None):
    """
    Calculate a ranking score for a post with Hinglish and emoji aware content signals.

    Args:
        post: dict with the keys
            "text": str,
            "likes": int,
            "comments": int,
            "author_content_watched": int,
            "author_reviews_posted": int,
            "author_public_watchlists": int,
            "media_count": int,
            "created_at": datetime,
        now: optional datetime used as the reference time for the post's age.
            Defaults to the current time, expressed in the timezone of
            ``post["created_at"]`` when that value is timezone-aware.

    Returns:
        float rounded to one decimal place. The additive weights are
        0.5 (engagement) + 0.3 (content) + 0.1 (author), so a brand-new post
        scores at most 90; time decay is applied as a multiplier on top.
        NOTE(review): if a 0-100 range is intended, the weights need rebalancing.
    """
    def capped(value, cap):
        # Fraction of `cap` reached, limited to [0, 1] so that neither huge nor
        # negative counters can push a component out of range.
        return max(0.0, min(value / cap, 1.0))

    # (1) Engagement (weight 0.5)
    engagement = (
        0.6 * capped(post["likes"], 100) +
        0.4 * capped(post["comments"], 50)
    )

    # (2) Content quality (weight 0.3)
    ## Sentiment: TextBlob polarity (-1..1) plus the emoji adjustment (-0.5..0.5)
    sentiment = TextBlob(post["text"]).sentiment.polarity
    emoji_score = calculate_emoji_sentiment(post["text"])
    # The raw sum spans -1.5..1.5, so the rescaled value must be clamped to stay in 0..1.
    combined_sentiment = max(0.0, min(1.0, (sentiment + emoji_score + 1) / 2))

    ## Hinglish movie relevance (0..1)
    movie_mentioned_score = check_movie_relevance(post["text"])

    ## Media boost (images/videos), saturating at 4 attachments
    media_boost = capped(post["media_count"], 4)

    content = (
        0.5 * combined_sentiment +
        0.3 * movie_mentioned_score +
        0.2 * media_boost
    )

    # (3) Author reputation (weight 0.1)
    author = (
        0.4 * capped(post["author_content_watched"], 10000) +
        0.3 * capped(post["author_reviews_posted"], 1000) +
        0.3 * capped(post["author_public_watchlists"], 1000)
    )

    # (4) Time decay (multiplier, half-life of 2 days)
    created_at = post["created_at"]
    if now is None:
        # Match the awareness of created_at; mixing naive and aware datetimes raises TypeError.
        now = datetime.now(created_at.tzinfo)
    # A timestamp in the future (clock skew) must not boost the score above its undecayed value.
    hours_old = max(0.0, (now - created_at).total_seconds() / 3600)
    decay = 0.5 ** (hours_old / 48)

    # Final weighted score
    raw_score = (0.5 * engagement + 0.3 * content + 0.1 * author) * 100
    final_score = decay * raw_score

    return round(final_score, 1)

def calculate_emoji_sentiment(text):
    """
    Calculate a sentiment adjustment from the emojis in ``text``.

    Every occurrence of a known emoji adds its weight; the sum is clamped so that
    emojis can nudge, but not dominate, the text sentiment.

    Args:
        text: the post text (may be empty).

    Returns:
        A number between -0.5 and +0.5 (0 when no known emoji is present).
    """
    emoji_sentiment_map = {
        # Positive emojis
        '❤️': 0.3, '👌': 0.3, '👍': 0.2, '😍': 0.3, '🔥': 0.2,
        '🎉': 0.2, '🤩': 0.3, '🙌': 0.2, '💯': 0.3, '😊': 0.2,
        # Negative emojis
        '👎': -0.3, '😒': -0.2, '💔': -0.3, '😠': -0.3, '🤮': -0.4,
        '😤': -0.2, '😑': -0.1, '🙄': -0.2
    }

    if not text:
        return 0

    # The text is scanned one code point at a time, but some emojis (the red heart)
    # are written as a base character plus the U+FE0F variation selector. Stripping
    # the selector from the keys lets the base character match on its own; the
    # selector and skin-tone modifiers that follow simply score 0.
    # Keys must therefore be single code points once the selector is removed.
    variation_selector = "\ufe0f"
    weights = {
        key.replace(variation_selector, ""): value
        for key, value in emoji_sentiment_map.items()
    }

    # Unknown characters contribute 0, so no separate "is this an emoji" filter is needed.
    total_score = sum(weights.get(char, 0) for char in text)

    # Clamp to prevent emojis from overpowering the text sentiment
    return max(-0.5, min(0.5, total_score))

def check_movie_relevance(text):
    """
    Estimate how movie-related a post is from English and Hinglish keywords.

    Matching is case-insensitive substring matching, so "movies" matches "movie"
    and a phrase such as "must watch" also counts its inner term "watch".

    Args:
        text: the post text (may be empty).

    Returns:
        Score between 0.2 (no match) and 1.0; each match adds 0.15.
    """
    # English movie terms
    english_terms = [
        "movie", "film", "cinema", "bollywood", "hollywood",
        "watch", "review", "actor", "actress", "director",
        "scene", "ending", "plot", "story", "screenplay"
    ]

    # Romanised Hindi single-word terms. This list was referenced below but never
    # defined, which raised NameError on a fresh kernel.
    # NOTE(review): starter list only - extend or replace with the intended vocabulary.
    hinglish_terms = [
        "picture", "kahani", "gaana", "dekhi", "dekha", "superhit", "jhakaas"
    ]

    # Common Hinglish phrases about movies
    hinglish_phrases = [
        "paisa vasool", "time pass", "mind blowing",
        "must watch", "mat dekho", "bakwas movie",
        "hit hai", "flop hai", "time waste"
    ]

    text_lower = (text or "").lower()

    # Count every term/phrase that occurs at least once in the text
    term_matches = (
        sum(1 for term in english_terms if term in text_lower) +
        sum(1 for term in hinglish_terms if term in text_lower) +
        sum(1 for phrase in hinglish_phrases if phrase in text_lower)
    )

    # Return score based on number of matches (0.2 base score if no matches)
    return min(1.0, 0.2 + (term_matches * 0.15))

In [9]:
# Two sample posts: one created right now and one 22 hours old, so the effect of
# time decay is visible in the printed scores.
posts = [
    dict(
        text="Superman Discussion thread. Tell us how you enjoyed the movie✌️",
        likes=22,
        comments=6,
        author_content_watched=153,
        author_reviews_posted=135,
        author_public_watchlists=5,
        media_count=1,
        created_at=datetime.now() - timedelta(hours=0),
    ),
    dict(
        text="CBFC ne Superman movie se kissiya kaat di bhai, SUPERMAN MOVIE SE! I'm equally baffled and disappointed at the same time.",
        likes=18,
        comments=7,
        author_content_watched=195,
        author_reviews_posted=188,
        author_public_watchlists=2,
        media_count=1,
        created_at=datetime.now() - timedelta(hours=22),
    ),
]

# Score each post with the rule-based scorer and show it next to its text.
for post in posts:
    score = calculate_post_score(post)
    print(f"Score: {score} | Text: {post['text']}")

Score: 25.4 | Text: Superman Discussion thread. Tell us how you enjoyed the movie✌️
Score: 13.9 | Text: CBFC ne Superman movie se kissiya kaat di bhai, SUPERMAN MOVIE SE! I'm equally baffled and disappointed at the same time.


In [1]:
from transformers import pipeline

# Build both Hugging Face pipelines a single time (e.g. at app startup); each call
# loads the model weights, downloading them on first use.

# English binary sentiment classifier (SST-2).
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# NLI-based zero-shot classifier used for topic relevance.
text_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use cpu


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [3]:
def analyze_content(text):
    """
    Analyse a post with the module-level `sentiment_analyzer` and `text_classifier`.

    Args:
        text: the post text. Only the first 512 characters are analysed.

    Returns:
        dict with
            "sentiment": 1 if the sentiment model's label is 'POSITIVE', else 0
            "movie_relevance": zero-shot score of the "movie review" label
    """
    # Character-level cut to bound the input size (note: characters, not model tokens).
    snippet = text[:512]

    # Sentiment analysis
    sentiment_result = sentiment_analyzer(snippet)
    sentiment_score = 1 if sentiment_result[0]['label'] == 'POSITIVE' else 0

    # Movie relevance classification
    candidate_labels = ["movie review", "film discussion", "off-topic"]
    classification = text_classifier(snippet, candidate_labels)

    # The pipeline returns 'labels' and 'scores' sorted by score, not in the order of
    # candidate_labels, so the score has to be looked up by label name rather than by
    # the label's position in the candidate list.
    score_by_label = dict(zip(classification['labels'], classification['scores']))
    movie_score = score_by_label["movie review"]

    return {
        "sentiment": sentiment_score,
        "movie_relevance": movie_score
    }

In [5]:
# Smoke test of analyze_content on the Hinglish sample post; the returned dict is the cell's output.
analyze_content("CBFC ne Superman movie se kissiya kaat di bhai, SUPERMAN MOVIE SE! I'm equally baffled and disappointed at the same time.")

{'sentiment': 0, 'movie_relevance': 0.5982581973075867}

In [6]:
from transformers import pipeline
from datetime import datetime, timedelta

class PostAnalyzer:
    """
    Score posts for ranking, using pre-trained multilingual models for the content signals.

    The models are loaded once in ``__init__``; create a single instance and reuse it.
    NOTE(review): both checkpoints are XLM-RoBERTa based; their tokenizers presumably
    need the `sentencepiece` package installed - confirm in the target environment.
    """

    # Labels offered to the zero-shot classifier.
    _CANDIDATE_LABELS = ["movie review", "film discussion", "off-topic"]
    # Labels whose score counts as "this post is about movies".
    _RELEVANT_LABELS = ("movie review", "film discussion")

    def __init__(self):
        # Initialize models once (at app startup)
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis", 
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment"
        )
        self.relevance_classifier = pipeline(
            "zero-shot-classification",
            model="joeddav/xlm-roberta-large-xnli"
        )

    @staticmethod
    def _capped_ratio(value, cap):
        """Return value/cap limited to the range [0, 1]."""
        return max(0.0, min(value / cap, 1.0))

    @staticmethod
    def _sentiment_to_unit_score(label, confidence):
        """Map a (label, confidence) prediction to 0..1, with 0.5 meaning neutral."""
        name = label.lower()
        # Accept both the readable names and the generic LABEL_N ids
        # (0 = negative, 1 = neutral, 2 = positive).
        if name in ("positive", "label_2"):
            return 0.5 + 0.5 * confidence
        if name in ("negative", "label_0"):
            return 0.5 - 0.5 * confidence
        return 0.5

    def calculate_post_score(self, post, now=None):
        """
        Compute the ranking score of a single post.

        Args:
            post: dict with the keys "text", "likes", "comments", "media_count",
                "author_content_watched", "author_reviews_posted",
                "author_public_watchlists" and "created_at" (datetime).
            now: optional reference datetime for the post's age; defaults to the
                current time in the timezone of ``post["created_at"]``.

        Returns:
            float rounded to one decimal place. The additive weights are 0.5 + 0.3 + 0.1,
            so the maximum is 90; time decay is applied as a multiplier.
        """
        ratio = self._capped_ratio

        # (1) Engagement (weight 0.5)
        engagement = (
            0.6 * ratio(post["likes"], 100) +
            0.4 * ratio(post["comments"], 50)
        )

        # (2) Content quality (weight 0.3) - from the ML models
        analysis = self._analyze_content(post["text"])

        content = (
            0.5 * analysis["sentiment"] +
            0.3 * analysis["movie_relevance"] +
            0.2 * ratio(post["media_count"], 4)
        )

        # (3) Author reputation (weight 0.1)
        author = (
            0.4 * ratio(post["author_content_watched"], 10000) +
            0.3 * ratio(post["author_reviews_posted"], 1000) +
            0.3 * ratio(post["author_public_watchlists"], 1000)
        )

        # (4) Time decay (multiplier, half-life of 2 days)
        created_at = post["created_at"]
        if now is None:
            # Match the awareness of created_at; mixing naive and aware datetimes raises TypeError.
            now = datetime.now(created_at.tzinfo)
        # A timestamp in the future must not inflate the score (decay would exceed 1).
        hours_old = max(0.0, (now - created_at).total_seconds() / 3600)
        decay = 0.5 ** (hours_old / 48)

        # Final Score
        raw_score = (0.5 * engagement + 0.3 * content + 0.1 * author) * 100
        return round(decay * raw_score, 1)

    def _analyze_content(self, text):
        """
        Run sentiment and movie-relevance analysis on the first 512 characters of ``text``.

        Each model call is best-effort: on any failure the error is printed and a
        neutral fallback value is used so that scoring can continue.

        Returns:
            dict with "sentiment" (0..1, 0.5 = neutral) and "movie_relevance" (0..1).
        """
        # Character-level cut to bound the input size (note: characters, not model tokens).
        snippet = text[:512]

        # Sentiment Analysis
        try:
            prediction = self.sentiment_analyzer(snippet)[0]
            sentiment_score = self._sentiment_to_unit_score(
                prediction["label"], prediction["score"]
            )
        except Exception as e:
            print(f"Sentiment analysis failed: {e}")
            sentiment_score = 0.5  # Fallback value

        # Movie Relevance
        try:
            relevance_result = self.relevance_classifier(
                snippet,
                self._CANDIDATE_LABELS,
                multi_label=True
            )
            # The result lists labels sorted by score, so look scores up by name
            # instead of assuming the candidate order is preserved.
            score_by_label = dict(
                zip(relevance_result["labels"], relevance_result["scores"])
            )
            movie_score = max(
                score_by_label[label] for label in self._RELEVANT_LABELS
            )
        except Exception as e:
            print(f"Relevance classification failed: {e}")
            movie_score = 0.2  # Fallback value

        return {
            "sentiment": sentiment_score,
            "movie_relevance": movie_score
        }

In [7]:
# Usage Example
analyzer = PostAnalyzer()

# calculate_post_score expects a single post dict, so the sample posts are scored one by one
# (passing the whole `posts` list fails when the method indexes it with string keys).
for post in posts:
    print(f"Score: {analyzer.calculate_post_score(post)} | Text: {post['text']}")

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']