In [2]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn import metrics

# Preparing dataset

In [3]:
def load_dataset(fname):
    """Read the critical-temperature table and rescale its columns.

    Parameters
    ----------
    fname : str or file-like
        Handed directly to ``pandas.read_csv``; lines starting with ``#``
        are skipped as comments.

    Returns
    -------
    pandas.DataFrame
        The loaded table where
        * ``C``, ``Mn``, ``Si``, ``Cr``, ``Ni`` are multiplied by 100
          (to wt.%; assumes the file stores weight fractions),
        * ``A1``, ``A1prime``, ``A3`` are shifted by -273.15
          (to oC; assumes the file stores kelvin),
        * ``eutectoid`` is encoded as 1 for ``'hiper'`` and 0 for any
          other value.
    """
    composition_cols = ('C', 'Mn', 'Si', 'Cr', 'Ni')
    temperature_cols = ('A1', 'A1prime', 'A3')

    table = pd.read_csv(fname, comment='#')

    # compositions to wt.%
    for col in composition_cols:
        table[col] *= 100

    # temperatures to oC
    for col in temperature_cols:
        table[col] -= 273.15

    # hiper -> 1; anything else (e.g. hipo) -> 0
    table['eutectoid'] = table['eutectoid'].map(lambda label: int(label == 'hiper'))

    return table

In [4]:
# Load the table and discard the 'file' and 'macro' columns in one step.
raw_df = load_dataset('../../databases/Tcritical.csv').drop(columns=['file', 'macro'])
# raw_df.head()

In [122]:
# Work on a copy so raw_df is left untouched if feature columns are added here.
df = raw_df.copy()

# Candidate second-order feature set (linear, squared and pairwise terms):
#   C,  C**2,  C*Mn,  C*Si,  C*Cr,  C*Ni,
#   Mn, Mn**2, Mn*Si, Mn*Cr, Mn*Ni,
#   Si, Si**2, Si*Cr, Si*Ni,
#   Cr, Cr**2, Cr*Ni,
#   Ni, Ni**2
#
# The feature construction below is currently disabled, so the model further
# down only sees the five linear composition terms.
# NOTE: the squared silicon term is written to 'Si2' so that it does not
# overwrite the linear 'Si' column.

# df['C2'] = df['C'].apply(lambda x: x**2)
# df['CMn'] = df['C'] * df['Mn']
# df['CSi'] = df['C'] * df['Si']
# df['CCr'] = df['C'] * df['Cr']
# df['CNi'] = df['C'] * df['Ni']
# df['Mn2'] = df['Mn'].apply(lambda x: x**2)
# df['MnSi'] = df['Si'] * df['Mn']
# df['MnCr'] = df['Cr'] * df['Mn']
# df['MnNi'] = df['Ni'] * df['Mn']
# df['Si2'] = df['Si'].apply(lambda x: x**2)
# df['SiCr'] = df['Cr'] * df['Si']
# df['SiNi'] = df['Si'] * df['Ni']
# df['Cr2'] = df['Cr'].apply(lambda x: x**2)
# df['CrNi'] = df['Cr'] * df['Ni']
# df['Ni2'] = df['Ni'].apply(lambda x: x**2)

df.head()

Unnamed: 0,C,Mn,Si,Cr,Ni,A1,A1prime,A3,eutectoid
0,0.0,0.0001,0.0001,0.0001,0.0001,,,911.65,0
1,0.0,0.0001,0.0001,0.0001,0.750075,,,880.26,0
2,0.0,0.0001,0.0001,0.0001,1.50005,,,853.25,0
3,0.0,0.0001,0.0001,0.0001,2.250025,,,829.6,0
4,0.0,0.0001,0.0001,0.0001,3.0,,,808.65,0


# Preparing eutectoid dataset

In [123]:
# Classification frame: composition columns plus the 'eutectoid' label.
# DataFrame.drop returns a new frame, so no explicit copy of df is needed
# (the previous df.copy() result was overwritten immediately).
df_eutec = df.drop(columns=['A1', 'A1prime', 'A3'])

# Class balance of the label (1 = 'hiper', 0 = everything else).
df_eutec['eutectoid'].value_counts()

1    4542
0    2333
Name: eutectoid, dtype: int64

In [124]:
# Preview the classification frame (temperature columns already dropped).
df_eutec.head()

Unnamed: 0,C,Mn,Si,Cr,Ni,eutectoid
0,0.0,0.0001,0.0001,0.0001,0.0001,0
1,0.0,0.0001,0.0001,0.0001,0.750075,0
2,0.0,0.0001,0.0001,0.0001,1.50005,0
3,0.0,0.0001,0.0001,0.0001,2.250025,0
4,0.0,0.0001,0.0001,0.0001,3.0,0


# Eutectoid logistic regression

## Split between train and test datasets

In [125]:
def _split_between_train_and_test(df, size, target_col='eutectoid', random_state=42):
    """Split a frame into train/test feature matrices and target vectors.

    Every column except ``target_col`` is used as a feature. Features are
    selected by name rather than by position, so the split keeps working
    when extra feature columns are added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the ``target_col`` column.
    size : float or int
        Forwarded as ``train_size`` to ``train_test_split``
        (e.g. 0.8, 0.7 or 0.99).
    target_col : str, optional
        Name of the label column. Defaults to ``'eutectoid'``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the shapes of the two
        feature matrices are printed as a side effect.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    target = df[target_col]
    # Drop the label by name so it can never leak into the feature matrix,
    # regardless of how many columns df has or where the label sits.
    features = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        train_size=size,
        random_state=random_state
    )

    print('Train size: {}'.format(X_train.shape))
    print('Test size: {}'.format(X_test.shape))
    return X_train, X_test, y_train, y_test

In [126]:
# 0.8 is passed as the train size; the helper prints the resulting shapes.
X_train, X_test, y_train, y_test = _split_between_train_and_test(df_eutec, 0.8)

Train size: (5500, 5)
Test size: (1375, 5)




In [127]:
# Preview the training features (the 'eutectoid' label is not among them).
X_train.head()

Unnamed: 0,C,Mn,Si,Cr,Ni
1905,0.45,0.0001,0.750075,0.750075,0.0001
4383,1.05,0.0001,0.0001,0.750075,2.250025
3538,0.75,2.250025,0.750075,1.50005,2.250025
4943,1.05,3.0,1.50005,2.250025,2.250025
5809,1.35,0.750075,1.50005,0.750075,3.0


In [128]:
# Preview the training labels; the index matches the rows of X_train.
y_train.head()

1905    0
4383    1
3538    1
4943    1
5809    1
Name: eutectoid, dtype: int64

## Logistic regression by itself

In [129]:
# Pin the solver explicitly: the library default is version-dependent, and the
# results recorded in this notebook were produced with solver='liblinear'.
# Fixing it keeps the fitted coefficients comparable across re-runs.
lr = linear_model.LogisticRegression(solver='liblinear')
# lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Checking accuracy of regression

In [130]:
# Fraction of correctly classified rows on the training split ...
train_accuracy = metrics.accuracy_score(y_train, lr.predict(X_train))
print("Logistic regression Train Accuracy :: ", train_accuracy)

# ... and on the held-out split.
test_accuracy = metrics.accuracy_score(y_test, lr.predict(X_test))
print("Logistic regression Test Accuracy :: ", test_accuracy)

Logistic regression Train Accuracy ::  0.978
Logistic regression Test Accuracy ::  0.9745454545454545


In [133]:
# Express the fitted model relative to the coefficient of the first feature
# (carbon): the intercept term first, then every coefficient.
carbon_coef = lr.coef_[0][0]
print(-lr.intercept_ / carbon_coef)
print(-lr.coef_[0] / carbon_coef)

def carbon_eutectoid(X, model=None):
    """Carbon content on the decision boundary of the logistic model.

    The boundary is where the logit is zero:
    ``intercept + beta_C * C + sum(beta_i * x_i) = 0``. Solving for carbon
    gives ``C = -(intercept + sum(beta_i * x_i)) / beta_C``.

    Parameters
    ----------
    X : array-like
        One composition (1-D) or several (2-D, one per row), with columns in
        the same order as the model's features. The first column (carbon) is
        ignored; only the remaining columns enter the calculation.
        DataFrames, lists, tuples and arrays are all accepted.
    model : object, optional
        Fitted binary classifier exposing ``coef_`` (shape ``(1, n)``) and
        ``intercept_``. Defaults to the notebook-level ``lr``.

    Returns
    -------
    numpy.ndarray
        Threshold carbon content, one value per input row.
    """
    if model is None:
        model = lr  # fall back to the classifier fitted earlier in the notebook

    # asarray covers DataFrame/list/tuple/ndarray alike; atleast_2d lets a
    # single 1-D composition be passed without wrapping it in another list.
    X = np.atleast_2d(np.asarray(X, dtype=float))

    beta = model.coef_[0]
    # k = logit - beta_C*C
    k = model.intercept_ + np.dot(X[:, 1:], beta[1:])
    C = -k / beta[0]  # carbon content at the threshold
    return C

# Carbon in the eutectoid point: threshold carbon content when every other
# feature is zero (the first entry, carbon itself, is ignored by the function).
carbon_eutectoid([[0,0,0,0,0]])

[0.73879187]
[-1.         -0.04165555  0.01858358 -0.10860868 -0.04024891]


array([0.73879187])