# Predicting daily price

In [1]:
import yfinance as yf
import pandas as pd 
import numpy as np 

In [7]:
# obtain nifty 50 symbols list
nifty50 = pd.read_csv('../data/nifty-50.csv')
nifty_50_symlist = (nifty50['SYMBOL \n']).to_list()
nifty_50_symlist.remove("NIFTY 50.NS")
nifty_50_symlist

ValueError: list.remove(x): x not in list

In [69]:
# downloading nifty 50 data from yfinance

n50_df = yf.download(tickers=nifty_50_symlist, start="2014-01-01", period='1d')

[*********************100%%**********************]  50 of 50 completed


In [70]:
cipla_df = yf.download(tickers='CIPLA.NS', period='1d', start='2014-01-01')
cipla_df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01,402.500000,405.899994,400.799988,401.750000,383.688110,429074
2014-01-02,402.000000,406.350006,391.500000,392.299988,374.662933,734194
2014-01-03,391.549988,394.000000,386.000000,391.750000,374.137695,998001
2014-01-06,393.899994,395.950012,387.649994,391.600006,373.994446,752348
2014-01-07,393.899994,394.899994,389.000000,390.850006,373.278137,955034
...,...,...,...,...,...,...
2024-05-24,1489.150024,1493.900024,1479.050049,1486.449951,1486.449951,758326
2024-05-27,1491.000000,1492.550049,1475.199951,1478.650024,1478.650024,1393334
2024-05-28,1482.949951,1494.750000,1475.199951,1479.449951,1479.449951,784496
2024-05-29,1475.250000,1497.599976,1472.599976,1493.550049,1493.550049,1607691


## Technical analysis indicator addition

In [71]:
import pandas_ta as ta 

In [72]:
cipla_df['5-MA'] = ta.ema(cipla_df['Adj Close'], length=5)
cipla_df['20-MA'] = ta.ema(cipla_df['Adj Close'], length=20)
cipla_df['50-MA'] = ta.ema(cipla_df['Adj Close'], length=50)
cipla_df['200-MA'] = ta.ema(cipla_df['Adj Close'], length=200)
cipla_df['RSI'] = ta.rsi(cipla_df['Adj Close'], length=14)
adx = ta.adx(cipla_df['High'], cipla_df['Low'], cipla_df['Adj Close'])
cipla_df['ADX'] = adx['ADX_14']
cipla_df['Relative Vol'] = round(cipla_df['Volume'] / ta.sma(cipla_df['Volume'], length=60), 2)
cipla_df['next open'] = cipla_df['Open'].shift(-1)
cipla_df['target'] = cipla_df['Adj Close'].shift(-1)

change = (cipla_df['Adj Close'] - cipla_df['Open']) / cipla_df['Open'] * 100
change[(change >= 2) | (change <= -2)] = 1
change[change != 1] = 0

cipla_df['target2'] = change

cipla_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,5-MA,20-MA,50-MA,200-MA,RSI,ADX,Relative Vol,next open,target,target2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2014-01-01,402.5,405.899994,400.799988,401.75,383.68811,429074,,,,,,,,402.0,374.662933,1.0
2014-01-02,402.0,406.350006,391.5,392.299988,374.662933,734194,,,,,,,,391.549988,374.137695,1.0
2014-01-03,391.549988,394.0,386.0,391.75,374.137695,998001,,,,,,,,393.899994,373.994446,1.0
2014-01-06,393.899994,395.950012,387.649994,391.600006,373.994446,752348,,,,,,,,393.899994,373.278137,1.0
2014-01-07,393.899994,394.899994,389.0,390.850006,373.278137,955034,375.952264,,,,,,,391.100006,386.027954,1.0


In [73]:
# removing null values
cipla_df.dropna(inplace=True)
cipla_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,5-MA,20-MA,50-MA,200-MA,RSI,ADX,Relative Vol,next open,target,target2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2014-10-27,629.799988,631.099976,620.099976,628.099976,602.597839,1316795,589.714743,574.333604,540.596454,424.917971,65.886795,26.430349,0.66,630.0,619.914978,1.0
2014-10-28,630.0,648.799988,625.700012,646.150024,619.914978,1860143,599.781488,578.674687,543.706984,426.85824,70.516292,27.131536,0.92,652.900024,630.228577,0.0
2014-10-29,652.900024,658.799988,649.0,656.900024,630.228577,2970820,609.930517,583.584581,547.099988,428.881825,72.877127,28.122499,1.45,656.25,624.184265,1.0
2014-10-30,656.25,663.799988,647.0,650.599976,624.184265,2236206,614.681767,587.451218,550.122901,430.825133,69.371344,29.20548,1.07,652.700012,639.486633,1.0
2014-10-31,652.700012,673.0,647.799988,666.549988,639.486633,1185420,622.950056,592.406972,553.627361,432.901367,72.922711,30.499187,0.57,670.0,636.176819,1.0


### Training regressiom model to predict next days close price 



In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

features = [
    '5-MA',
    '20-MA',
    '50-MA', 
    '200-MA', 
    'Volume',
    'Adj Close', 
    'next open'
]

X_train, X_test, y_train, y_test = train_test_split(cipla_df[features], cipla_df['target'], test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R2 Score: {r2_score(y_test, y_pred)}")


Mean Squared Error: 130.62547825499698
R2 Score: 0.9977955743597975


## model to classify to higher change in price on given day

In [75]:
cipla_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', '5-MA', '20-MA',
       '50-MA', '200-MA', 'RSI', 'ADX', 'Relative Vol', 'next open', 'target',
       'target2'],
      dtype='object')

In [81]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import label_binarize

features = ['Adj Close', 'Volume', '5-MA', '20-MA',
       '50-MA', '200-MA', 'RSI', 'ADX', 'Relative Vol', 'next open',
       ]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cipla_df[features], cipla_df['target2'], test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier()
    # "Random Forest": RandomForestClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier()
}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    # roc_auc = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f'Classifier: {name}')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    # print(f'ROC AUC Score: {roc_auc:.2f}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Classification Report:\n{class_report}')
    print('-' * 60)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier: Logistic Regression
Accuracy: 0.67
Precision: 0.33
Recall: 0.50
F1 Score: 0.40
Confusion Matrix:
[[  0 157]
 [  0 316]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       157
         1.0       0.67      1.00      0.80       316

    accuracy                           0.67       473
   macro avg       0.33      0.50      0.40       473
weighted avg       0.45      0.67      0.54       473

------------------------------------------------------------
Classifier: Support Vector Machine
Accuracy: 0.67
Precision: 0.33
Recall: 0.50
F1 Score: 0.40
Confusion Matrix:
[[  0 157]
 [  1 315]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       157
         1.0       0.67      1.00      0.80       316

    accuracy                           0.67       473
   macro avg       0.33      0.50      0.40       473
weighted avg       0.45    