In [5]:
import pandas as pd

# Input CSV with the raw price history and the path for the re-saved copy
input_file_path = 'TSLA.csv'
output_file_path = 'Tesla2.csv'

# Read the file; only preview and save when the read actually succeeded.
# Without the `else` branch a failed read would fall through to `to_csv`
# and either raise a misleading NameError ("Error saving the file...") or
# silently write out a stale `df` left over from a previous kernel run.
try:
    df = pd.read_csv(input_file_path, encoding='utf-8')
except Exception as e:
    print("Error reading the file:", e)
else:
    # Print the first 5 rows of the DataFrame
    print(df.head())

    # Save the DataFrame to a new CSV file (without the index column)
    try:
        df.to_csv(output_file_path, index=False)
        print(f"File successfully saved to {output_file_path}")
    except Exception as e:
        print("Error saving the file as CSV:", e)


         Date      Open      High       Low     Close  Adj Close     Volume
0  29/06/2010  1.266667  1.666667  1.169333  1.592667   1.592667  281494500
1  30/06/2010  1.719333  2.028000  1.553333  1.588667   1.588667  257806500
2  01/07/2010  1.666667  1.728000  1.351333  1.464000   1.464000  123282000
3  02/07/2010  1.533333  1.540000  1.247333  1.280000   1.280000   77097000
4  06/07/2010  1.333333  1.333333  1.055333  1.074000   1.074000  103003500
File successfully saved to Tesla2.csv


In [6]:
# Show the DataFrame metadata: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3562 entries, 0 to 3561
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       3562 non-null   object 
 1   Open       3562 non-null   float64
 2   High       3562 non-null   float64
 3   Low        3562 non-null   float64
 4   Close      3562 non-null   float64
 5   Adj Close  3562 non-null   float64
 6   Volume     3562 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 194.9+ KB


In [7]:
# Number of missing values in each column
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ------------------- 1. Data Preprocessing -------------------

# Enforce numeric dtypes on the price and volume columns. The CSV already
# holds plain numbers (no currency symbols), so a cast is all that is needed.
price_cols = ['Open', 'High', 'Low', 'Close']
df[price_cols] = df[price_cols].astype(float)
df['Volume'] = df['Volume'].astype('int64')

# Convert 'Date' to datetime. The file stores dates day-first (e.g.
# 29/06/2010, 01/07/2010); passing the explicit format prevents ambiguous
# values such as 01/07/2010 from being read as 7 January.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Rolling and lag features computed later depend on chronological row order.
df.sort_values('Date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Handle missing values by filling them with the mean of each numeric column.
# numeric_only=True keeps the datetime 'Date' column out of the mean.
df.fillna(df.mean(numeric_only=True), inplace=True)

# ------------------- Feature Engineering -------------------

# Daily trading range (intraday high minus low)
df['Volatility'] = df['High'] - df['Low']

# Moving averages of the closing price over the PREVIOUS 7/14/50 sessions.
# The series is shifted by one row before rolling so that the window never
# contains the same-day Close, which is the regression target; this also
# matches the forecast loop, which builds its averages from prior closes only.
prior_close = df['Close'].shift(1)
for window in (7, 14, 50):
    df[f'MA{window}'] = prior_close.rolling(window=window).mean()

# Lag features: the closing price one and two sessions earlier
df['Lag1'] = prior_close
df['Lag2'] = df['Close'].shift(2)

# Drop the warm-up rows that have NaN moving averages or lags
df.dropna(inplace=True)

# Standardise 'Volume' (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full series, before the
# train/test split performed further down.
scaler = StandardScaler()
df['Volume_Scaled'] = scaler.fit_transform(df[['Volume']]).ravel()

# ------------------- 2. Exploratory Data Analysis (EDA) -------------------

# Line chart of the closing price over the whole history
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['Date'], df['Close'], label='Closing Price')
ax.set_title('Historical Closing Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()

# ------------------- 3. Modeling -------------------

# Define features (X) and target (y)
features = ['High', 'Low', 'Volume_Scaled', 'Volatility', 'MA7', 'MA14', 'MA50', 'Lag1', 'Lag2']
X = df[features]
y = df['Close']

# Chronological split: the first 80% of rows train the model and the last 20%
# are held out. shuffle=False is essential for time series; a shuffled split
# would interleave test days with the training period and leak future
# information into the fit. Relies on the rows being in date order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# XGBoost Regressor
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Report hold-out accuracy so the quality of the fit is visible
print(f"XGBoost MAE:  {mean_absolute_error(y_test, y_pred_xgb):.4f}")
print(f"XGBoost RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_xgb)):.4f}")
print(f"XGBoost R2:   {r2_score(y_test, y_pred_xgb):.4f}")

# ------------------- 4. Forecasting Future Data (for 2025) -------------------

def forecast_prices_for_2025(model, df, days_ahead=365, feature_cols=None, random_state=None):
    """Recursively forecast closing prices for the business days after the data ends.

    Each step builds a feature row from the closes known so far (historical
    closes plus earlier predictions), asks ``model`` for the next close, and
    feeds that prediction back in as history for the following step.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(DataFrame)`` and returning one
        value per input row.
    df : pandas.DataFrame
        Historical data. Must contain 'Date', 'Close', 'High', 'Low',
        'Volatility', 'MA7', 'MA14', 'MA50', 'Lag1', 'Lag2' and every column
        named in ``feature_cols``.
    days_ahead : int, default 365
        Number of business days to forecast.
    feature_cols : list of str, optional
        Columns passed to ``model.predict``, in order. Defaults to the
        notebook-level ``features`` list.
    random_state : int or None, optional
        Seed for the noise used to synthesise High/Low; pass an int for
        reproducible forecasts.

    Returns
    -------
    pandas.DataFrame
        One row per forecast day with the same columns as ``df``; 'Close'
        holds the predicted price.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("df must contain at least one row of historical data")

    if feature_cols is None:
        # Fall back to the feature list the notebook used to train the model.
        feature_cols = features
    feature_cols = list(feature_cols)

    rng = np.random.default_rng(random_state)

    # Work in chronological order so that "last row" really is the latest day.
    history = df.sort_values('Date')
    last_row = history.iloc[-1].copy()

    # Business days strictly after the last known date. Starting one calendar
    # day later (instead of slicing off the first element) stays correct even
    # when the last date itself is not a business day.
    future_dates = pd.date_range(last_row['Date'] + pd.Timedelta(days=1),
                                 periods=days_ahead, freq='B')

    # Rolling buffer of the most recent closes; 50 is the longest window used.
    closes = [float(c) for c in history['Close'].tail(50)]

    forecasted_data = []

    for future_date in future_dates:
        new_row = last_row.copy()
        new_row['Date'] = future_date

        # Moving averages over the closes known before this day. Slicing
        # simply yields a shorter window when fewer values are available.
        new_row['MA7'] = np.mean(closes[-7:])
        new_row['MA14'] = np.mean(closes[-14:])
        new_row['MA50'] = np.mean(closes[-50:])

        # Lag features: the previous close and the one before it
        new_row['Lag1'] = closes[-1]
        new_row['Lag2'] = closes[-2] if len(closes) >= 2 else closes[-1]

        # The row is an object-dtype Series (it also holds the date), so cast
        # the selected features back to float before predicting.
        X_new = pd.DataFrame([new_row[feature_cols]]).astype(float)
        predicted_close = float(model.predict(X_new)[0])

        new_row['Close'] = predicted_close
        forecasted_data.append(new_row)

        # Feed the prediction back into the history buffer
        closes.append(predicted_close)
        closes = closes[-50:]

        # Seed the next step: synthesise a plausible High/Low around the
        # predicted close. abs() guarantees High >= Close >= Low, and
        # Volatility is refreshed so it stays consistent with them.
        last_row = new_row.copy()
        high = predicted_close * (1 + abs(rng.normal(0, 0.01)))
        low = predicted_close * (1 - abs(rng.normal(0, 0.01)))
        last_row['High'] = high
        last_row['Low'] = low
        last_row['Volatility'] = high - low

    # Assemble the forecast; restore proper dtypes lost by the object rows
    forecast_df = pd.DataFrame(forecasted_data, columns=list(history.columns))
    forecast_df = forecast_df.reset_index(drop=True).infer_objects()
    forecast_df['Date'] = pd.to_datetime(forecast_df['Date'])

    return forecast_df

# Run the forecast with the fitted XGBoost model
forecast_df = forecast_prices_for_2025(xgb, df, days_ahead=365)

# Overlay the forecast (red) on the full price history
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['Date'], df['Close'], label='Historical Prices')
ax.plot(forecast_df['Date'], forecast_df['Close'], label='Forecasted Prices for 2025', color='r')
ax.set_title('Historical vs Forecasted Tesla Closing Prices for 2025')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()

# ------------------- Plot for Years 2023, 2024, and 2025 -------------------

# Keep only the rows dated 2023 or later, for both history and forecast
df_filtered = df.loc[df['Date'].dt.year >= 2023]
forecast_df_filtered = forecast_df.loc[forecast_df['Date'].dt.year >= 2023]

# Zoomed-in comparison: recent history in blue, forecast in red
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_filtered['Date'], df_filtered['Close'], label='Historical Prices (2023, 2024)', color='b')
ax.plot(forecast_df_filtered['Date'], forecast_df_filtered['Close'], label='Forecasted Prices (2025)', color='r')
ax.set_title('Historical vs Forecasted Tesla Closing Prices (2023-2025)')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()

# Print the forecast dates alongside the predicted closing prices
print(forecast_df.loc[:, ['Date', 'Close']])
