In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [6]:
# Path of the dataset CSV; a relative path, so it is resolved against the
# notebook's current working directory.
file_path = 'NIFTY50_all.csv'

# Read the full CSV into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)



In [7]:
# Preview the first five rows to sanity-check the columns and their values.
df.head(n=5)

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2007-11-27,MUNDRAPORT,EQ,440.0,770.0,1050.0,770.0,959.0,962.9,984.72,27294366,2687719000000000.0,,9859619.0,0.3612
1,2007-11-28,MUNDRAPORT,EQ,962.9,984.0,990.0,874.0,885.0,893.9,941.38,4581338,431276500000000.0,,1453278.0,0.3172
2,2007-11-29,MUNDRAPORT,EQ,893.9,909.0,914.75,841.0,887.0,884.2,888.09,5124121,455065800000000.0,,1069678.0,0.2088
3,2007-11-30,MUNDRAPORT,EQ,884.2,890.0,958.0,890.0,929.0,921.55,929.17,4609762,428325700000000.0,,1260913.0,0.2735
4,2007-12-03,MUNDRAPORT,EQ,921.55,939.75,995.0,922.0,980.0,969.3,965.65,2977470,287520000000000.0,,816123.0,0.2741


In [8]:
# Pairwise Pearson correlation between the numeric columns only
# (non-numeric columns such as Date/Symbol/Series are excluded first).
numeric_columns_df = df.select_dtypes(include=['number'])
corr_matrix = numeric_columns_df.corr(method='pearson')
corr_matrix

Unnamed: 0,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
Prev Close,1.0,0.999718,0.999616,0.999601,0.999505,0.999518,0.999622,-0.118742,0.052888,-0.149562,-0.132987,-0.009034
Open,0.999718,1.0,0.999857,0.999863,0.999748,0.999761,0.999867,-0.118779,0.052704,-0.14984,-0.133025,-0.009078
High,0.999616,0.999857,1.0,0.99979,0.999881,0.999892,0.999937,-0.118595,0.053697,-0.14886,-0.132979,-0.009962
Low,0.999601,0.999863,0.99979,1.0,0.999868,0.999878,0.999929,-0.118914,0.052144,-0.150774,-0.133141,-0.008543
Last,0.999505,0.999748,0.999881,0.999868,1.0,0.999991,0.99995,-0.118713,0.05311,-0.149736,-0.133051,-0.009367
Close,0.999518,0.999761,0.999892,0.999878,0.999991,1.0,0.999963,-0.118705,0.053106,-0.149751,-0.133042,-0.009362
VWAP,0.999622,0.999867,0.999937,0.999929,0.99995,0.999963,1.0,-0.118757,0.053013,-0.149787,-0.133096,-0.009425
Volume,-0.118742,-0.118779,-0.118595,-0.118914,-0.118713,-0.118705,-0.118757,1.0,0.629582,0.70377,0.829507,-0.209112
Turnover,0.052888,0.052704,0.053697,0.052144,0.05311,0.053106,0.053013,0.629582,1.0,0.840716,0.573714,-0.194729
Trades,-0.149562,-0.14984,-0.14886,-0.150774,-0.149736,-0.149751,-0.149787,0.70377,0.840716,1.0,0.567846,-0.252629


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
count,235192.0,235192.0,235192.0,235192.0,235192.0,235192.0,235192.0,235192.0,235192.0,120344.0,219115.0,219115.0
mean,1266.196349,1267.759708,1286.58144,1247.488465,1266.388302,1266.554351,1267.1323,3045903.0,161013800000000.0,61964.27,1315098.0,0.502997
std,2581.37032,2585.259609,2619.649216,2546.621396,2581.392543,2582.140942,2582.69998,7333981.0,329808500000000.0,68664.57,2831670.0,0.190019
min,0.0,8.5,9.75,8.5,9.1,9.15,9.21,3.0,10470000.0,11.0,5.0,0.0236
25%,274.3,275.0,279.5,269.6,274.4,274.35,274.6975,219009.5,16128160000000.0,21834.0,125383.0,0.3647
50%,566.5,567.025,576.9,556.5,567.0,566.7,566.94,1010938.0,68326030000000.0,44068.0,501756.0,0.511
75%,1242.2,1243.3125,1263.0,1221.65,1242.9,1242.4,1242.6625,3019851.0,186383500000000.0,78935.5,1452233.0,0.6384
max,32861.95,33399.95,33480.0,32468.1,32849.0,32861.95,32975.24,481058900.0,3.564334e+16,1643015.0,232530700.0,1.0


In [10]:
# 1. Define features (X) and target (y), chosen after checking the correlation
#    between Close and the other columns.
# NOTE(review): Open/High/Low/VWAP come from the same row (same trading day) as
# Close, so this fit explains Close from same-day prices rather than forecasting
# it ahead of time — confirm that this is the intent.

features = ['Open', 'High', 'Low', 'Prev Close', 'VWAP']
target = ['Close']

# LinearRegression.fit raises on NaN, so drop incomplete rows up front.
# Dropping on features and target together keeps X and y row-aligned.
model_df = df[features + target].dropna()

X = model_df[features]
# Selecting with a list keeps y as a single-column DataFrame (2-D).
y = model_df[target]

In [11]:
# 2. Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The fixed random_state makes the shuffled split reproducible across runs.
# NOTE(review): the rows are dated records, and a shuffled split mixes earlier
# and later dates across train and test — consider a chronological split if the
# model is meant for forecasting.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# 3. Train the model
# Ordinary least-squares linear regression with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and training can be chained.
print("Training model...")
model = LinearRegression().fit(X_train, y_train)
print("Model training complete.")

Training model...
Model training complete.


In [13]:
# 4. Predict on the held-out test features
y_pred = model.predict(X=X_test)


In [14]:
# 5. Score the predictions against the held-out targets
# MSE is in squared units of the target; R² is the fraction of variance explained.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)


In [15]:
# Print the evaluation report: a header line followed by one line per metric.
report_lines = [
    "\n--- Model Evaluation ---",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R²) Score: {r2}",
]
print("\n".join(report_lines))


--- Model Evaluation ---
Mean Squared Error (MSE): 370.3286249835853
R-squared (R²) Score: 0.9999452719180613


In [16]:
# Persist the fitted estimator to disk so it can be reloaded with joblib.load.
# joblib.dump returns the list of written file names, which the cell displays.
model_filename = 'nifty50model.joblib'
joblib.dump(value=model, filename=model_filename)


['nifty50model.joblib']