In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download the NLTK resources the preprocessing below relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9 and later); 'punkt' is kept for older installations.
# Packages that are already present are simply reported as up-to-date.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the dataset
nepse_tweets = pd.read_csv("nepse_tweets.csv")  # Replace with your dataset's path

# Initialize the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership checks when filtering tokens one by one
stop_words = set(stopwords.words('english'))

# Function for preprocessing
def preprocess_text(text):
    """Clean one tweet and return it as a space-joined string of lemmatized tokens.

    Steps, in order: strip URLs, strip @mentions and #hashtags, drop every
    character that is not an ASCII letter or whitespace, lowercase, tokenize,
    remove English stopwords, and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the NaN that pandas
        uses for a missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' when nothing is left.
    """
    # re.sub raises TypeError on non-strings, so a single missing value in the
    # column would abort the whole .apply(); map such values to empty output.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags (the whole tag, not just the leading symbol)
    text = re.sub(r"@\w+|#\w+", '', text)
    # Remove numbers and special characters (anything but ASCII letters/whitespace)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every tweet and store the result alongside the raw text
cleaned_series = nepse_tweets['text'].apply(preprocess_text)
nepse_tweets['cleaned_text'] = cleaned_series

# Persist the cleaned dataset (optional)
nepse_tweets.to_csv(path_or_buf="cleaned_nepse_tweets.csv", index=False)

# Show the first rows of raw vs. cleaned text side by side
preview = nepse_tweets[['text', 'cleaned_text']].head()
print(preview)





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryankarki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryankarki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aryankarki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  \
0  Today's News   \n(Friday, December 6th, 2024) ...   
1  Good Friday - Bhagavad Gita Chapter 9, Verse 2...   
2  The Nepal Stock Exchange (NEPSE) index decreas...   
3  HRL aligns well with its fundamentals, showing...   
4  eyeframe tales \nfor educational purpose only ...   

                                        cleaned_text  
0                      today news friday december th  
1  good friday bhagavad gita chapter verse assura...  
2  nepal stock exchange nepse index decreased poi...  
3  hrl aligns well fundamental showing promising ...  
4                  eyeframe tale educational purpose  


In [None]:
import pandas as pd

# Example data creation (replace with your actual data loading process).
# The raw-tweet column is named "text" so this placeholder frame has the same
# schema as the real dataset ('text' + 'cleaned_text') used by the
# preprocessing step.
data = {
    "text": [
        "Today's News\n(Friday, December 6th, 2024)...",
        "Good Friday - Bhagavad Gita Chapter 9, Verse 2...",
        "The Nepal Stock Exchange (NEPSE) index decreas...",
        "HRL aligns well with its fundamentals, showing...",
        "eyeframe tales\nfor educational purpose only...",
    ],
    "cleaned_text": [
        "today news friday december th",
        "good friday bhagavad gita chapter verse assura...",
        "nepal stock exchange nepse index decreased poi...",
        "hrl aligns well fundamental showing promising...",
        "eyeframe tale educational purpose",
    ]
}
df = pd.DataFrame(data)


In [None]:
# Write the example frame `df` to disk without the index column.
# NOTE(review): a later cell writes `nepse_tweets` to this same path, so
# whichever of the two cells runs last determines the file's contents.
output_file_path = "processed_tweets.csv"
df.to_csv(path_or_buf=output_file_path, index=False)

print(f"Data saved successfully to {output_file_path}")



In [None]:
# Write the tweet frame (raw + cleaned text) to disk without the index column;
# this is the file the feature-extraction cell reads back.
output_file_path = "processed_tweets.csv"
nepse_tweets.to_csv(path_or_buf=output_file_path, index=False)

print(f"Data saved successfully to {output_file_path}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from transformers import pipeline
from datetime import datetime

# Read the preprocessed tweets back from disk
df = pd.read_csv("processed_tweets.csv")

# Convert 'created_at' to datetime, but only when the dataset actually has that column
if 'created_at' in df:
    df['created_at'] = pd.to_datetime(df['created_at'])

### TEXT FEATURES ###

# 1. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Adjust max_features as needed

# Tweets whose cleaned text is empty are written to the CSV as blank fields
# and come back from read_csv as NaN, which TfidfVectorizer refuses to
# process. Treat them as empty documents (an all-zero TF-IDF row) instead.
cleaned_corpus = df['cleaned_text'].fillna('').astype(str)
tfidf_features = tfidf_vectorizer.fit_transform(cleaned_corpus).toarray()

# Convert to DataFrame for better handling (one column per vocabulary term)
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vectorizer.get_feature_names_out())

# 2. Embedding using BERT (Optional for semantic understanding)
# Build a feature-extraction pipeline; model and tokenizer use the same checkpoint
bert_checkpoint = 'bert-base-uncased'
bert_pipeline = pipeline(
    'feature-extraction',
    model=bert_checkpoint,
    tokenizer=bert_checkpoint,
)

# NOTE(review): the pipeline above is constructed even though its only use below is commented out.
# Generate embeddings (optional, comment if not needed for now)
# df['bert_embeddings'] = df['cleaned_text'].apply(lambda x: bert_pipeline(x)[0][0])  # [CLS] token embeddings

### METADATA FEATURES ###

# Scale engagement metrics with MinMaxScaler (assuming columns like
# 'favorite_count', 'retweet_count', 'reply_count' exist)
engagement_columns = ['favorite_count', 'retweet_count', 'reply_count']  # Adjust to your actual column names
if not set(engagement_columns).issubset(df.columns):
    # At least one metric is missing: contribute no engagement features at all
    engagement_df = pd.DataFrame()
else:
    scaler = MinMaxScaler()
    engagement_features = scaler.fit_transform(df[engagement_columns])
    normalized_names = [f'normalized_{col}' for col in engagement_columns]
    engagement_df = pd.DataFrame(engagement_features, columns=normalized_names)

# Time-based patterns: derive hour and day-of-week columns from the timestamp
if 'created_at' in df.columns:
    timestamp_accessor = df['created_at'].dt
    df['hour'] = timestamp_accessor.hour  # Extract hour
    df['day_of_week'] = timestamp_accessor.dayofweek  # Extract day of the week

### FINAL DATAFRAME ###

# Pick the time-derived columns when they exist, otherwise an empty placeholder frame
if 'created_at' in df.columns:
    time_df = df[['hour', 'day_of_week']]
else:
    time_df = pd.DataFrame()

# Combine all features column-wise: TF-IDF terms, scaled engagement, time columns
feature_frames = [tfidf_df, engagement_df, time_df]
final_df = pd.concat(feature_frames, axis=1)

# Save the processed features to a new CSV (index omitted)
output_file = "features_processed_tweets.csv"
final_df.to_csv(path_or_buf=output_file, index=False)
print(f"Features saved successfully to {output_file}")



  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
