In [140]:
import pandas as pd
import os


DATA_FILEPATH = "data/titanic.csv"

def fetch_data():
    """Import the data from csv to pd dataframe"""
    relaviteFilepath = os.path.join(os.path.abspath(''), DATA_FILEPATH)
    return pd.read_csv(relaviteFilepath)

# df stands for dataframe. This is the object that we will manipulate throughouht the notebook
titanicdf = fetch_data()
titanicdf

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


# 1. In the folder “Exercises” you have access to the dataset Titanic.csv presenting information about travellers with their status (survived=1 (yes) or =0 (no)). In addition, you have the information about the class (Pclass), name (Name), gender (Sex), age (Age), sibling or spouse on board (1/0), parents or children aboard (1/0), and fare price (Fare).

## (a) What is the best default rule for this dataset? (Default means without any evidence about the person)?

The best default rule is the one that has the less error without any information about the person, ie the most probable one.

In [141]:
#We could go with (1-survive) for fatality but we would run the risk of sending Nan, null, or wrong values (ex: 8 ? 'missing^' ?) to the fatality count
survivorSelector = (titanicdf['Survived'] == 1)
FatalitySelector = (titanicdf['Survived'] == 0)

survivorNumber = len(titanicdf[survivorSelector])
fatalityNumber = len(titanicdf[FatalitySelector])
errors = len(titanicdf) - (survivorNumber + fatalityNumber)

decision = 'survived' if survivorNumber > fatalityNumber else 'did not survive'

print(f"There are {survivorNumber} survivors and {fatalityNumber} fatalities, with {errors} undetermined. The best default decision is {decision}.")

There are 342 survivors and 545 fatalities, with 0 undetermined. The best default decision is did not survive.


## (b) What is the best 1R for this dataset?


We will use Sickit learn to determine the best 1R for the dataset. Of course, we must not pass the labels (Survived col)

In [142]:
from mlxtend.classifier import OneRClassifier

X_train = titanicdf.drop(columns=['Survived'], axis=1, inplace=False)
y_train = titanicdf['Survived']

oneRClass = OneRClassifier()
oneRClass.fit(X_train.to_numpy(), y_train)

print(f"Prediction based on feature {X_train.columns[oneRClass.feature_idx_]}, with break down :")
oneRClass.prediction_dict_

Prediction based on feature Name, with break down :


{'total error': 0,
 'rules (value: class)': {'Capt. Edward Gifford Crosby': 0,
  'Col. John Weir': 0,
  'Col. Oberst Alfons Simonius-Blumer': 1,
  'Don. Manuel E Uruchurtu': 0,
  'Dr. Alfred Pain': 0,
  'Dr. Alice (Farnham) Leader': 1,
  'Dr. Arthur Jackson Brewe': 0,
  'Dr. Ernest Moraweck': 0,
  'Dr. Henry William Frauenthal': 1,
  'Dr. Max Stahelin-Maeglin': 1,
  'Dr. William Edward Minahan': 0,
  'Jonkheer. John George Reuchlin': 0,
  'Lady. (Lucille Christiana Sutherland)Duff Gordon': 1,
  'Major. Archibald Willingham Butt': 0,
  'Major. Arthur Godfrey Peuchen': 1,
  'Master. Alden Gates Caldwell': 1,
  'Master. Andre Mallet': 1,
  'Master. Arthur Rice': 0,
  'Master. Assad Alexander Thomas': 1,
  'Master. Bertram Vere Dean': 1,
  'Master. Clarence Gustaf Hugo Asplund': 0,
  'Master. Eden Leslie Coutts': 1,
  'Master. Edmond Roger Navratil': 1,
  'Master. Edvin Rojj Felix Asplund': 1,
  'Master. Eino Viljami Panula': 0,
  'Master. Elias Nicola-Yarred': 1,
  'Master. Eric Rice': 0,

## (c) Can you produce a second rule based on a single attribute with a good effectiveness? You need to split the dataset into two disjoint sample, the training and the test set. For  example, used 75% for the training sample, and the remaining 25% for the test set.


This is clearly an over-fitting problem and generally a "cheating" AI using the names to declare if they survived or not.

In [143]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = titanicdf['Survived']

maxAccuracy = 0
bestPredictor = None

for col in titanicdf.drop('Survived', axis=1).columns:
    x = titanicdf[[col]]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42, stratify=y)

    oneRClass = OneRClassifier()
    oneRClass.fit(X_train.to_numpy(), y_train)

    y_pred = oneRClass.predict(X_test.to_numpy())
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Prediction based on feature {col}.\n Accuracy: {accuracy}.\n")
    if accuracy > maxAccuracy:
        maxAccuracy, bestPredictor = accuracy, col

print(f"Found best predictor for titanic : {bestPredictor} with {maxAccuracy} accuracy")


Prediction based on feature Pclass.
 Accuracy: 0.6891891891891891.





Prediction based on feature Name.
 Accuracy: 0.6126126126126126.

Prediction based on feature Sex.
 Accuracy: 0.7702702702702703.

Prediction based on feature Age.
 Accuracy: 0.6261261261261262.

Prediction based on feature Siblings/Spouses Aboard.
 Accuracy: 0.6531531531531531.

Prediction based on feature Parents/Children Aboard.
 Accuracy: 0.6216216216216216.

Prediction based on feature Fare.
 Accuracy: 0.6711711711711712.

Found best predictor for titanic : Sex with 0.7702702702702703 accuracy


# 2. Using your select stock / market index and your decision criterion (binary or ternary) on the daily return of the next day or on the trend (daily return after 5/10 days), can you generate a 1R model using as possible predictor the volume, and the moving average (with a period of 5, 10, 20 50 or 200). You can learn on all days except the last 100 (that will be used as the test set).


In [144]:
DATA_FILEPATH = "data/CAC40.csv"

# df stands for dataframe. This is the object that we will manipulate throughouht the notebook
cac40df = fetch_data()

We add the labels (daily retunrs and ternary label)

In [145]:
#Daily return on closing price
cac40df["Daily_return"] = (cac40df["Close"] - cac40df["Close"].shift(1)) / cac40df["Close"].shift(1)

#Label
threshold = 0.005
cac40df["Prediction"] = cac40df["Daily_return"].apply(lambda x: 'up' if (x > threshold) else 'down' if (x < -threshold) else 'flat')
#Shift to have next day prediction
cac40df["Prediction"] = cac40df["Prediction"].shift(-1)

#Label
threshold = 0.005
cac40df["Prediction_nbr"] = cac40df["Daily_return"].apply(lambda x: 1 if (x > threshold) else 2 if (x < -threshold) else 0)
#Shift to have next day prediction
cac40df["Prediction_nbr"] = cac40df["Prediction_nbr"].shift(-1)

cac40df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Daily_return,Prediction,Prediction_nbr
0,1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,1832.000000,0,,up,1.0
1,1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,1860.000000,0,0.015284,up,1.0
2,1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,1874.000000,0,0.007527,flat,0.0
3,1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,1872.000000,0,-0.001067,flat,0.0
4,1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,1880.000000,0,0.004274,up,1.0
...,...,...,...,...,...,...,...,...,...,...
7999,2021-09-15,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,-0.010424,up,1.0
8000,2021-09-16,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,0.005919,down,2.0
8001,2021-09-17,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,-0.007912,down,2.0
8002,2021-09-20,6450.390137,6471.089844,6389.620117,6455.810059,6455.810059,120069400,-0.017409,up,1.0


We re-do the moving average like in serie 2

In [146]:
maWindows = [5, 10, 20, 50, 200]

featurecols = []

for maWindow in maWindows:
    colName = f"ma{maWindow}"
    featurecols.append(colName)
    cac40df[colName] = cac40df["Close"].rolling(maWindow).mean()


And now we try to do oneR classifiers with these features

In [147]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cac40df = cac40df.dropna()

y = cac40df['Prediction_nbr']

maxAccuracy = 0
bestPredictor = None

featurecols.append('Volume')

rowCount = len(cac40df)
testSize = 100
trainSize = rowCount - 100

for col in featurecols:
    x = cac40df[[col]]

    X_train, X_test, y_train, y_test = x[:trainSize], x[-testSize:], y[:trainSize], y[-testSize:]

    oneRClass = OneRClassifier()
    oneRClass.fit(X_train.to_numpy(), y_train)

    y_pred = oneRClass.predict(X_test.to_numpy())
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Prediction based on feature {col}.\n Accuracy: {accuracy}.\n")
    if accuracy > maxAccuracy:
        maxAccuracy, bestPredictor = accuracy, col

print(f"Found best predictor for cac40df : {bestPredictor} with {maxAccuracy} accuracy")

Prediction based on feature ma5.
 Accuracy: 0.53.

Prediction based on feature ma10.
 Accuracy: 0.53.

Prediction based on feature ma20.
 Accuracy: 0.53.

Prediction based on feature ma50.
 Accuracy: 0.53.





Prediction based on feature ma200.
 Accuracy: 0.53.

Prediction based on feature Volume.
 Accuracy: 0.55.

Found best predictor for cac40df : Volume with 0.55 accuracy


Little test with one or two predictors from serie 2

* MA collide with closing price
* Stochastic indicator

In [148]:
collideTreshold = 1

featurecols = []

for maWindow in maWindows:
    colName = f"maCollide{maWindow}"
    featurecols.append(colName)
    cac40df[colName] = (abs(cac40df["Close"] - cac40df[f"ma{maWindow}"]) < collideTreshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df[colName] = (abs(cac40df["Close"] - cac40df[f"ma{maWindow}"]) < collideTreshold)


In [149]:
mw = 14
cac40df[f"min{mw}"] = cac40df["Close"].rolling(mw).min()
cac40df[f"max{mw}"] = cac40df["Close"].rolling(mw).max()
cac40df["Stochastic"] = 100 * ((cac40df["Close"] - cac40df[f"min{mw}"]) / (cac40df[f"max{mw}"] - cac40df[f"min{mw}"]))

featurecols.append("Stochastic")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df[f"min{mw}"] = cac40df["Close"].rolling(mw).min()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df[f"max{mw}"] = cac40df["Close"].rolling(mw).max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df["Stochastic"] = 100 * ((cac40df["Close"] - cac40df[f"min{mw}"]) / (cac40df[f"max{mw

In [156]:

meanPrice = cac40df["Close"].mean()
threshold = meanPrice * 0.0004

def compare(close, low, high):
    if abs(close - low) < threshold:
        return 1
    if abs(high - close) < threshold:
        return 2
    return 0

maWindows = 200
# calculate mean rolling
mean = cac40df["Close"].rolling(maWindow).mean()
# calculate std rolling
std = cac40df["Close"].rolling(maWindow).std()
# calculate high and low band
cac40df["Boll_bands_high_200"] = mean + 2*std
cac40df["Boll_bands_low_200"] = mean - 2*std

cac40df["low_meeting"] = abs(cac40df["Close"] - cac40df["Boll_bands_low_200"]) < threshold
cac40df["high_meeting"] = abs(cac40df["Boll_bands_high_200"] - cac40df["Close"]) < threshold
cac40df['meetings'] = cac40df.apply(lambda x: 1 if x['high_meeting'] else 1 if x['low_meeting'] else 0, axis = 1)

featurecols.append("low_meeting")
featurecols.append("high_meeting")
featurecols.append("meetings")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df["Boll_bands_high_200"] = mean + 2*std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df["Boll_bands_low_200"] = mean - 2*std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cac40df["low_meeting"] = abs(cac40df["Close"] - cac40df["Boll_bands_low_200"]) < threshold
A value is trying to be

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Daily_return,Prediction,Prediction_nbr,...,maCollide50,maCollide200,min14,max14,Stochastic,Boll_bands_high_200,Boll_bands_low_200,low_meeting,high_meeting,meetings
5199,2010-09-29,3780.929932,3798.479980,3725.570068,3737.120117,3737.120117,143812000,-0.006706,down,2.0,...,False,False,3710.610107,3788.010010,34.250702,4109.436427,3348.630067,False,False,0
5200,2010-09-30,3711.959961,3782.669922,3693.459961,3715.179932,3715.179932,160919000,-0.005871,down,2.0,...,False,False,3710.610107,3788.010010,5.904173,4108.931327,3348.342568,False,False,0
5201,2010-10-01,3722.929932,3754.080078,3672.899902,3692.090088,3692.090088,135442000,-0.006215,down,2.0,...,False,False,3692.090088,3788.010010,0.000000,4107.518530,3347.955665,False,False,0
5202,2010-10-04,3689.179932,3691.800049,3640.479980,3649.810059,3649.810059,113094000,-0.011452,up,1.0,...,False,False,3649.810059,3788.010010,0.000000,4105.657120,3347.331376,False,False,0
5203,2010-10-05,3648.959961,3742.659912,3638.340088,3731.929932,3731.929932,124876600,0.022500,up,1.0,...,False,False,3649.810059,3788.010010,59.421058,4103.858589,3347.341707,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7998,2021-09-14,6672.189941,6677.069824,6613.520020,6652.970215,6652.970215,63626300,-0.003588,down,2.0,...,True,False,6652.970215,6763.080078,0.000000,7053.145908,5298.039175,False,False,0
7999,2021-09-15,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,-0.010424,up,1.0,...,False,False,6583.620117,6763.080078,0.000000,7055.897694,5305.379991,False,False,0
8000,2021-09-16,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,0.005919,down,2.0,...,False,False,6583.620117,6763.080078,21.714998,7059.397099,5312.014986,False,False,0
8001,2021-09-17,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,-0.007912,down,2.0,...,False,False,6570.189941,6763.080078,0.000000,7061.707352,5319.672834,False,False,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cac40df = cac40df.dropna()

y = cac40df['Prediction_nbr']

maxAccuracy = 0
bestPredictor = None

rowCount = len(cac40df)
testSize = 100
trainSize = rowCount - 100

for col in featurecols:
    x = cac40df[[col]]

    X_train, X_test, y_train, y_test = x[:trainSize], x[-testSize:], y[:trainSize], y[-testSize:]

    oneRClass = OneRClassifier()
    oneRClass.fit(X_train.to_numpy(), y_train)

    y_pred = oneRClass.predict(X_test.to_numpy())
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Prediction based on feature {col}.\n Accuracy: {accuracy}.\n")
    if accuracy > maxAccuracy:
        maxAccuracy, bestPredictor = accuracy, col

print(f"Found best predictor for cac40df : {bestPredictor} with {maxAccuracy} accuracy")

Prediction based on feature maCollide5.
 Accuracy: 0.53.

Prediction based on feature maCollide10.
 Accuracy: 0.53.

Prediction based on feature maCollide20.
 Accuracy: 0.54.

Prediction based on feature maCollide50.
 Accuracy: 0.53.

Prediction based on feature maCollide200.
 Accuracy: 0.53.

Prediction based on feature Stochastic.
 Accuracy: 0.55.

Prediction based on feature Volume.
 Accuracy: 0.55.

Found best predictor for cac40df : Stochastic with 0.55 accuracy
