In [None]:
!pip install transformers
!pip install torch
!pip install pandas

In [None]:
# Step 1: Install TextBlob
!pip install textblob

# Step 2: Import libraries
import pandas as pd
import re
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

# Step 3: Load the file as raw text lines
# The CSV is treated as malformed (no clean separation), so it is not handed to
# a CSV parser; each physical line is kept verbatim for the manual parsing step.
with open('/kaggle/input/radditnews/RedditNews.csv', 'r', encoding='utf-8') as raw_file:
    lines = list(raw_file)

# Show the first two raw lines, followed by a blank line, as a sanity check
print("First few lines of raw data:")
print("".join(lines[:2]), end="\n\n")

# Step 4: Parse lines and extract (date, headline) pairs
#
# Each entry appended to `data` is a dict {'Date': 'YYYY-MM-DD', 'Headline': str}.
# A token that does not start with a date is attached to the most recently seen
# date; tokens seen before any date (e.g. a header row) are dropped.
data = []
date_pattern = r'\d{4}-\d{2}-\d{2}'  # Match YYYY-MM-DD
current_date = None

for line in lines:
    # Split *before* every date-looking substring (zero-width lookahead) so that
    # a physical line holding several records yields one token per record.
    # NOTE(review): this also splits on a date that merely appears inside a
    # headline's text; confirm against the data whether that can happen.
    tokens = re.split('(?=' + date_pattern + ')', line.strip())

    for token in tokens:
        if not token:
            continue

        # Extract date at the beginning of token
        match = re.match(date_pattern, token)
        if match:
            current_date = match.group()
            # The rest is the headline. In a `date,headline` CSV row the text
            # right after the date still begins with the field separator, so
            # drop it; otherwise it stays glued to the headline and the
            # anchored b'...' wrapper regexes in the cleaning step cannot match.
            headline = token[len(current_date):].strip().lstrip(',').strip()
        else:
            headline = token.strip()

        # Undo CSV quoting when the whole field is wrapped in double quotes:
        # remove the outer pair and collapse doubled quotes ("") back to one.
        if len(headline) >= 2 and headline[0] == '"' and headline[-1] == '"':
            headline = headline[1:-1].replace('""', '"').strip()

        # Skip if no valid date or empty headline
        if not current_date or not headline:
            continue

        data.append({'Date': current_date, 'Headline': headline})

# Step 5: Create DataFrame
# Name the columns explicitly so that an empty parse result still produces a
# frame with 'Date' and 'Headline' columns instead of raising KeyError on the
# df['Date'] lookup below.
df = pd.DataFrame(data, columns=['Date', 'Headline'])
print(f"\n Extracted {len(df)} headlines from {df['Date'].nunique()} unique days.")

# Step 6: Clean Headline column
def clean_headline(text):
    """Strip a bytes-literal wrapper and backslash escapes from a headline.

    Args:
        text: Headline string, possibly of the form b'...' or b"..." and
            possibly containing backslash escape sequences.

    Returns:
        The headline with the b'...'/b"..." wrapper removed, escaped quotes
        and escaped backslashes unescaped, the two-character sequences \\r and
        \\n replaced by a space, and surrounding whitespace stripped. Any other
        backslash sequence is left untouched.
    """
    # Strip first so the anchored wrapper patterns below still match when the
    # input carries leading/trailing whitespace.
    text = text.strip()

    # Remove b'...' or b"..." wrapper
    text = re.sub(r"^b'(.*)'$", r"\1", text)
    text = re.sub(r'^b"(.*)"$', r'\1', text)

    # Unescape in a single left-to-right pass. Chained str.replace calls would
    # re-read the backslash produced by unescaping `\\`, so an escaped
    # backslash followed by `n` or `r` would wrongly be turned into a space.
    replacements = {"'": "'", '"': '"', "\\": "\\", "r": " ", "n": " "}
    text = re.sub(
        r"\\(.)",
        lambda m: replacements.get(m.group(1), m.group(0)),
        text,
    )
    return text.strip()

# Coerce every headline to str and run it through clean_headline
df['Headline'] = df['Headline'].astype(str).apply(clean_headline)

# Drop placeholder-like or garbage rows: anything in the known junk list, and
# anything of five characters or fewer
invalid = ['nan', '', 'b?', 'b', '""', "''"]
is_placeholder = df['Headline'].isin(invalid)
is_long_enough = df['Headline'].str.len() > 5
df = df[~is_placeholder & is_long_enough]

# Step 7: Sentiment analysis function
def get_sentiment(text):
    """Return the TextBlob polarity score of `text` (-1 to 1)."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

print("\n Computing sentiment for each headline...")
df['Sentiment'] = df['Headline'].apply(get_sentiment)

# Step 8: Average the per-headline scores for each date
daily_sentiment = (
    df.groupby('Date')['Sentiment']
    .mean()
    .reset_index()
    .rename(columns={'Sentiment': 'Daily_Sentiment'})
)
# Parse the date strings so the sort below is chronological
daily_sentiment['Date'] = pd.to_datetime(daily_sentiment['Date'])

# Order by date and renumber the rows from 0
daily_sentiment = daily_sentiment.sort_values('Date').reset_index(drop=True)

# Step 9: Show a preview and summary statistics
print("\n Sample Daily Sentiment (first 10 days):")
print(daily_sentiment.head(10))

print("\n Daily Sentiment Statistics:")
print(daily_sentiment['Daily_Sentiment'].describe())

# Step 10: Write the daily series to CSV (no index column)
output_file = 'RedditNews_Daily_Sentiment.csv'
daily_sentiment.to_csv(output_file, index=False, encoding='utf-8')
print(f"\n Saved daily sentiment to: {output_file}")


# To get sentiment score:

In [None]:
from textblob import TextBlob

def get_sentiment_score(sentence):
    """Return the TextBlob polarity score of `sentence` (-1 to 1)."""
    return TextBlob(sentence).sentiment.polarity

# Prompt for a sentence interactively (this blocks until the user answers)
user_sentence = input("Enter a sentence to analyze its sentiment: ")

# Score the sentence and report the polarity to three decimal places
score = get_sentiment_score(user_sentence)
print(f"\nSentiment score for '{user_sentence}': {score:.3f}")



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf

In [None]:
# Date window and ticker for the price history
start = '2008-06-08'
end ='2016-07-01'

stock ='GOOG'
data = yf.download(stock, start=start, end=end)

# Fail loudly when nothing came back: otherwise an empty CSV would be written
# and reported as a successful download.
if data.empty:
    raise RuntimeError(f"No price data returned for {stock} between {start} and {end}.")

# Save to CSV
data.to_csv('GOOG_2008-2016.csv')
print("Data downloaded and saved to GOOG_2008-2016.csv")

In [None]:
import pandas as pd

# Load the datasets
try:
    df_reddit = pd.read_csv('RedditNews_Daily_Sentiment.csv')
    df_goog = pd.read_csv('GOOG_2008-2016.csv')
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure the files are uploaded to your Colab environment.")
    # Re-raise instead of calling exit(): in a notebook exit() terminates the
    # kernel and discards all in-memory state, while re-raising just stops
    # this cell (and a "Run All") with the original traceback.
    raise

# Preprocess GOOG data
# Drop the first two rows (redundant headers)
df_goog = df_goog.iloc[2:].copy()

# Rename the first column to 'Date' for merging
df_goog = df_goog.rename(columns={df_goog.columns[0]: 'Date'})

# Convert 'Date' columns to datetime objects for accurate merging
df_reddit['Date'] = pd.to_datetime(df_reddit['Date'])
df_goog['Date'] = pd.to_datetime(df_goog['Date'])

# Merge the two DataFrames based on the 'Date' column.
# how='left' keeps every GOOG row; dates without sentiment get NaN.
merged_df = pd.merge(df_goog, df_reddit, on='Date', how='left')

# Handle missing values in 'Daily_Sentiment' column
# It's good practice to inspect how many missing values are there before filling
print("Number of missing values in 'Daily_Sentiment' before handling:")
print(merged_df['Daily_Sentiment'].isnull().sum())

# For simplicity, we'll fill missing sentiment values with 0,
# assuming no sentiment data implies a neutral sentiment.
# You might choose a different strategy based on your analysis needs (e.g., forward fill, mean)
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which does not
# update merged_df under Copy-on-Write.
merged_df['Daily_Sentiment'] = merged_df['Daily_Sentiment'].fillna(0)

print("\nNumber of missing values in 'Daily_Sentiment' after handling:")
print(merged_df['Daily_Sentiment'].isnull().sum())

# Display the first few rows of the merged DataFrame
print("\nFirst 5 rows of the merged DataFrame:")
print(merged_df.head())

# Display information about the merged DataFrame to confirm data types and non-null counts
print("\nInformation about the merged DataFrame:")
# info() prints its report itself and returns None, so it is not wrapped in print()
merged_df.info()

# Optionally, save the merged DataFrame to a new CSV file
merged_df.to_csv('GOOG_with_Sentiment.csv', index=False)
print("\nMerged DataFrame saved to 'GOOG_with_Sentiment.csv'")

In [None]:
import pandas as pd

# --- Step 1: Load the datasets ---
try:
    df_reddit = pd.read_csv('RedditNews_Daily_Sentiment.csv')
    df_goog = pd.read_csv('GOOG_2008-2016.csv')
    print("Datasets loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure the files are uploaded to your Colab environment.")
    # Re-raise instead of calling exit(): in a notebook exit() terminates the
    # kernel and discards all in-memory state, while re-raising just stops
    # this cell (and a "Run All") with the original traceback.
    raise

# --- Step 2: Preprocess GOOG Data ---
# Drop the first two rows (redundant headers) from GOOG data
df_goog = df_goog.iloc[2:].copy()
print("Dropped redundant header rows from GOOG data.")

# Rename the first column to 'Date' for merging
df_goog = df_goog.rename(columns={df_goog.columns[0]: 'Date'})
print("Renamed first column of GOOG data to 'Date'.")

# Convert 'Date' columns to datetime objects for accurate merging
df_reddit['Date'] = pd.to_datetime(df_reddit['Date'])
df_goog['Date'] = pd.to_datetime(df_goog['Date'])
print("Converted 'Date' columns to datetime format.")

# --- Step 3: Merge DataFrames ---
# Merge GOOG data with Reddit sentiment data based on the 'Date' column
# 'how='left'' ensures all GOOG dates are kept, filling NaN for unmatched sentiment dates
merged_df = pd.merge(df_goog, df_reddit, on='Date', how='left')
print("Merged GOOG data with Reddit sentiment data.")

# Handle missing values in 'Daily_Sentiment' column
# Fill NaN values (where no sentiment data exists for a date) with 0, assuming neutral sentiment.
# Consider other imputation strategies (e.g., forward fill, mean) if your analysis requires.
initial_nulls = merged_df['Daily_Sentiment'].isnull().sum()
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which does not
# update merged_df under Copy-on-Write.
merged_df['Daily_Sentiment'] = merged_df['Daily_Sentiment'].fillna(0)
print(f"Filled {initial_nulls} missing sentiment values with 0.")

# --- Step 4: Add 'tomorrow's closing value' as the output feature ---
# Create the 'tomorrow's closing value' column by shifting the 'Close' column upwards by 1
# This makes the closing value of the next day the target for the current day's features.
target_col = "tomorrow's closing value"
merged_df[target_col] = merged_df['Close'].shift(-1)
print("Added 'tomorrow\'s closing value' column by shifting 'Close' prices.")

# Handle the last cell of the 'tomorrow's closing value' column
# The last value will be NaN after the shift. As requested, we fill it
# with the 'Close' value from the second-to-last day (previous 2 value).
if not merged_df.empty and pd.isna(merged_df[target_col].iloc[-1]):
    last_label = merged_df.index[-1]
    if len(merged_df) >= 2:
        # Fill the last row's target with the 'Close' of the second to last day
        merged_df.loc[last_label, target_col] = merged_df['Close'].iloc[-2]
        print("Filled the last 'tomorrow\'s closing value' with the 'Close' price from the second to last day.")
    elif len(merged_df) == 1:
        # Edge case: if only one row, fill with its own close value.
        merged_df.loc[last_label, target_col] = merged_df['Close'].iloc[-1]
        print("Dataset has only one row; filled 'tomorrow\'s closing value' with current day's 'Close'.")

# --- Step 5: Display and Save Results ---
print("\n--- Processed DataFrame Overview ---")
print("First 5 rows:")
print(merged_df.head())

print("\nLast 5 rows:")
print(merged_df.tail())

print("\nDataFrame Information (dtypes and non-null counts):")
# info() prints its report itself and returns None, so it is not wrapped in print()
merged_df.info()

# Optionally, save the final processed DataFrame to a new CSV file
output_filename = 'GOOG_with_Tomorrow_Close_Final.csv'
merged_df.to_csv(output_filename, index=False)
print(f"\nFinal processed DataFrame saved to '{output_filename}'")


**After preprocessing, the final dataset is saved as `GOOG_with_Tomorrow_Close_Final.csv`.**