<a href="https://colab.research.google.com/github/nandakishan-jinu/Stock-price-analysis/blob/main/proj_stock_(f1_score).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score



In [3]:
# Updated cleaning function to handle the dataset structure
def clean_stock_data(file_path):
    # Load the raw data
    raw_data = pd.read_csv(file_path)

    # Skip the first two rows (metadata and incomplete headers)
    data = raw_data[2:]

    # Assign proper column names from the first valid row
    data.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']

    # Drop the first row of actual data since it was used for column names
    data = data[1:]

    # Drop rows with missing values
    data.dropna(inplace=True)

    # Convert 'Date' column to datetime
    data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
    data.dropna(subset=['Date'], inplace=True)  # Remove rows with invalid dates

    # Set the 'Date' column as the index
    data.set_index('Date', inplace=True)

    # Convert relevant columns to numeric types
    for col in ['Close', 'High', 'Low', 'Open', 'Volume']:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Drop rows where numeric conversion failed
    data.dropna(inplace=True)

    return data



In [9]:
# Step 2: Comprehensive Time Series Analysis
def time_series_analysis(data):

    data['Daily Return'] = data['Close'].pct_change()

    return data

In [4]:
# Step 3: Machine Learning Model with F1 Score
def train_classification_model(data):
    """
    Train a classification model to predict stock price movement (Up/Down).
    """
    # Define target as price movement: 1 for Up, 0 for Down
    data['Target'] = (data['Close'].shift(-1) > data['Close']).astype(int)
    data.dropna(inplace=True)

    # Features and target
    features = ['Close', 'High', 'Low', 'Open', 'Volume', 'Daily Return']
    X = data[features]
    y = data['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Random Forest Classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)


    # Evaluate the model
    f1 = f1_score(y_test, y_pred)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"F1 Score: {f1}")

    return model, f1

In [10]:
# Main function to execute the workflow
def main(file_path):
    # Clean the dataset
    cleaned_data = clean_stock_data(file_path)
    analyzed_data = time_series_analysis(cleaned_data)
    # Train classification model
    model, f1 = train_classification_model(analyzed_data)
    print(f"Model trained. F1 Score: {f1}")


In [11]:
# Run the workflow
main('/content/drive/MyDrive/META_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64        12
           1       0.55      0.67      0.60         9

    accuracy                           0.62        21
   macro avg       0.62      0.62      0.62        21
weighted avg       0.63      0.62      0.62        21

Confusion Matrix:
 [[7 5]
 [3 6]]
F1 Score: 0.6
Model trained. F1 Score: 0.6


In [12]:
# Run the workflow
main('/content/drive/MyDrive/Netflix_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.36      0.47        11
           1       0.53      0.80      0.64        10

    accuracy                           0.57        21
   macro avg       0.60      0.58      0.56        21
weighted avg       0.60      0.57      0.55        21

Confusion Matrix:
 [[4 7]
 [2 8]]
F1 Score: 0.64
Model trained. F1 Score: 0.64


In [13]:
# Run the workflow
main('/content/drive/MyDrive/Apple_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.38      0.43         8
           1       0.67      0.77      0.71        13

    accuracy                           0.62        21
   macro avg       0.58      0.57      0.57        21
weighted avg       0.60      0.62      0.61        21

Confusion Matrix:
 [[ 3  5]
 [ 3 10]]
F1 Score: 0.7142857142857143
Model trained. F1 Score: 0.7142857142857143


In [14]:
# Run the workflow
main('/content/drive/MyDrive/Microsoft_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.33      0.35         9
           1       0.54      0.58      0.56        12

    accuracy                           0.48        21
   macro avg       0.46      0.46      0.46        21
weighted avg       0.47      0.48      0.47        21

Confusion Matrix:
 [[3 6]
 [5 7]]
F1 Score: 0.56
Model trained. F1 Score: 0.56


In [15]:
# Run the workflow
main('/content/drive/MyDrive/Amazon_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.30      0.35        10
           1       0.50      0.64      0.56        11

    accuracy                           0.48        21
   macro avg       0.46      0.47      0.46        21
weighted avg       0.47      0.48      0.46        21

Confusion Matrix:
 [[3 7]
 [4 7]]
F1 Score: 0.56
Model trained. F1 Score: 0.56


In [16]:
# Run the workflow
main('/content/drive/MyDrive/GOOGLE_stock_data.csv')

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.30      0.38        10
           1       0.53      0.73      0.62        11

    accuracy                           0.52        21
   macro avg       0.52      0.51      0.50        21
weighted avg       0.52      0.52      0.50        21

Confusion Matrix:
 [[3 7]
 [3 8]]
F1 Score: 0.6153846153846154
Model trained. F1 Score: 0.6153846153846154
