In [6]:
import pandas as pd
import numpy as np
import datetime
pd.options.mode.chained_assignment = None  # default='warn'

def is_number(s):
    '''Check whether an entry can be interpreted as a number.

    Parameters
    ----------
    s : object
        Value to test, e.g. a string taken from a data frame cell.

    Returns
    -------
    bool
        True when ``float(s)`` succeeds, False otherwise.
    '''
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: the text is not a numeric literal.
        # TypeError: float() rejects None, lists and similar objects outright.
        return False
    return True

In [4]:
# Read both semicolon-separated files, keeping every column as raw objects;
# type conversion happens in a later cell.
inp = pd.read_csv('training_inputs.csv', dtype=object, sep=';')
out = pd.read_csv('training_outputs.csv', dtype=object, sep=';')
inp = inp.reset_index(drop=True)

# Collapse the different boolean spellings to the labels 'False' / 'True'.
# The patterns are applied one after another, in this exact order.
for pattern, label in (
    (r'.*false.*', 'False'),
    (r'.*False.*', 'False'),
    (r'.*True.*', 'True'),
    (r'.*true.*', 'True'),
):
    inp = inp.replace(pattern, label, regex=True)

# Turn textual 'NaN' markers into real missing values.
inp = inp.replace('NaN', np.nan, regex=True)

In [24]:
# Column conversion works on a copy, so the raw frame `inp` stays untouched.
df = inp.copy()
df['ID'] = pd.to_numeric(df['ID'])

# S columns
# Per the author's note these are site visits and page views, i.e. quantities.
# The first S column is skipped in the loop (presumably S1, which is coerced
# to numeric separately below).
s_cols = [name for name in df.columns if name.startswith('S')][1:]
df['S1'] = pd.to_numeric(df['S1'], errors='coerce')

# Every remaining S column becomes numeric when any entry is made of digits
# only; otherwise it is parsed as datetime and remembered in `date_cols`.
date_cols = []
for name in s_cols:
    column = df[name]
    if not column.str.isdigit().any():
        df[name] = column.astype('<M8[ns]')
        date_cols.append(name)
        continue
    df[name] = pd.to_numeric(column)
        
# Express every date column as whole days elapsed up to a fixed benchmark
# date (1st January 2021), so the dates become ordinary numeric features.
today = datetime.datetime(2021, 1, 1)

if date_cols:
    # Time elapsed between each date and the benchmark; NaT where missing.
    elapsed = df[date_cols].apply(lambda dates: pd.Timestamp(today) - dates)

    # Per-row fallback for missing dates: the benchmark minus the mean of the
    # dates present in that row, floored to whole days. Rows without any date
    # get NaN here and therefore stay missing.
    elapsed_seconds = elapsed.apply(lambda gaps: gaps.dt.total_seconds())
    row_avg_days = np.floor(elapsed_seconds.mean(axis=1) / 86400)

    # Assign whole columns instead of writing single cells through chained
    # indexing (df[col][i] = ...), which pandas does not guarantee to write
    # back into df and which is very slow on large frames.
    for col in date_cols:
        df[col] = elapsed[col].dt.days.fillna(row_avg_days)
        
        
    
# Q columns
# Per the author's note these are usage patterns and periods, i.e.
# quantities: columns holding numbers are converted to a numeric dtype, the
# remaining columns get their 'True' / 'False' strings mapped to booleans.
q_cols = [col for col in df if col.startswith('Q')]
for col in q_cols:
    present = df[col].dropna()
    # Decide from every non-missing value. Series.any() is a truthiness
    # reduction and says nothing about whether the entries are numbers.
    holds_numbers = pd.to_numeric(present, errors='coerce').notna().all()
    if holds_numbers:
        df[col] = pd.to_numeric(df[col])
    else:
        df[col] = df[col].replace({'True' : True, 'False' : False})

# C columns
# Per the author's note these are contract references, not quantities.
# Only the first two C columns are converted to numbers here; the remaining
# ones (kept in `c_cols`) are left as they are.
# Open question from the author: should C5-C7 be bool?
all_c = [name for name in df.columns if name.startswith('C')]
c = all_c[:2]  # first columns
c_cols = all_c[2:]

for name in c:
    df[name] = pd.to_numeric(df[name])

# C1 carries the same value everywhere (author's note), so it is dropped;
# afterwards pandas picks the best nullable dtype for every column.
df = df.drop(columns=['C1']).convert_dtypes()

In [61]:
# Feature frame: every column except the first (presumably ID), with the
# pandas NA marker swapped for NumPy NaN.
features = df.iloc[:, 1:]
X = features.replace({pd.NA: np.nan})

# Target vector: the TARGET column of the outputs file as a numeric array.
y = pd.to_numeric(out['TARGET']).to_numpy()

In [64]:
# regularization
# first do label encoding
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# Every column type gets its own preprocessing branch.

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
category_steps = [
    ('c_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False)),
]
category_pipeline = Pipeline(category_steps)

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_steps = [
    ('n_imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('Scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(numerical_steps)

# Only the categorical branch is wired into the transformer. The numerical
# branch is defined above but its entry is currently disabled:
#   ('num_scaler', numerical_pipeline, make_column_selector(dtype_include=['int', 'float', 'bool'])),
transformers = [
    ('category_encoder', category_pipeline, make_column_selector(dtype_include='object')),
]
column_trans = ColumnTransformer(transformers)

# NOTE: X is overwritten with the transformed result, so this cell cannot be
# re-run without rebuilding X first.
X = column_trans.fit_transform(X)

In [88]:
# Fold-by-fold evaluation of a random forest: one clf.score value per fold
# is collected in `scores`.
scores = []
clf = RandomForestClassifier(n_jobs=4, random_state=None)
cv = KFold(n_splits=10, shuffle=False)

for train_rows, test_rows in cv.split(X):
    print("Train Index: ", train_rows, "\n")
    print("Test Index: ", test_rows)

    # Slice features and targets for this fold.
    X_train, y_train = X[train_rows], y[train_rows]
    X_test, y_test = X[test_rows], y[test_rows]

    clf.fit(X_train, y_train)
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)

TypeError: __init__() got an unexpected keyword argument 'scoring'

In [92]:
from costcla.models import CostSensitiveRandomForestClassifier
# Same fold-by-fold evaluation as before, using the plain random forest.
scores = []
clf = RandomForestClassifier(n_jobs=4, random_state=None)
# Disabled alternative from costcla; the traceback under this cell shows its
# fit() additionally requires a `cost_mat` argument:
# clf = CostSensitiveRandomForestClassifier(n_estimators=4) # costcla method

cv = KFold(n_splits=10, shuffle=False)

for train_rows, test_rows in cv.split(X):
    print("Train Index: ", train_rows, "\n")
    print("Test Index: ", test_rows)

    # Slice features and targets for this fold.
    X_train, y_train = X[train_rows], y[train_rows]
    X_test, y_test = X[test_rows], y[test_rows]

    clf.fit(X_train, y_train)
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)



Train Index:  [ 8553  8554  8555 ... 85526 85527 85528] 

Test Index:  [   0    1    2 ... 8550 8551 8552]


TypeError: fit() missing 1 required positional argument: 'cost_mat'

In [81]:
# Average of the per-fold clf.score values collected in `scores`.
np.mean(scores)

0.7940580958524501

In [87]:
from sklearn.metrics import roc_auc_score
# ROC AUC computed from column 1 of predict_proba (presumably the positive
# class - confirm against clf.classes_).
# NOTE(review): X_test, y_test and clf are whatever the last iteration of the
# cross-validation loop left behind, so this scores that single fold only.
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])


0.5835571290109531

Collecting costcla
  Downloading costcla-0.6-py3-none-any.whl (4.0 MB)
Collecting pyea>=0.2
  Downloading pyea-0.2.tar.gz (10 kB)
Building wheels for collected packages: pyea
  Building wheel for pyea (setup.py): started
  Building wheel for pyea (setup.py): finished with status 'done'
  Created wheel for pyea: filename=pyea-0.2-py3-none-any.whl size=6021 sha256=2a926f9491815144df8810e900feb4e2083ce4b693811bab0a59f8d97ebe1050
  Stored in directory: c:\users\nicolas\appdata\local\pip\cache\wheels\c4\c7\f9\c43bd31860d7235d875091659066bf793ea300fd0621156737
Successfully built pyea
Installing collected packages: pyea, costcla
Successfully installed costcla-0.6 pyea-0.2
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Nicolas\anaconda3\python.exe -m pip install --upgrade pip' command.


In [76]:
# NOTE(review): this pairs *rows* of X_train (from the second row on) with
# the feature importances - the output shows arrays of 0/1 values next to
# each importance. Feature names were presumably intended; confirm.
list(zip(X_train[1:], clf.feature_importances_))

[(array([0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.]),
  0.00014284649362080742),
 (array([0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.

In [66]:
# NOTE(review): fails with the IndexError shown below. By this point X has
# been replaced by the result of column_trans.fit_transform, which apparently
# cannot be indexed with the column names in s_cols.
X[s_cols].astype('Int64')

  """Entry point for launching an IPython kernel.


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [68]:
# NOTE(review): fails with the TypeError shown below - the elements produced
# by iterating X are numpy arrays, which are not hashable.
set(X)

TypeError: unhashable type: 'numpy.ndarray'

In [42]:
# Distinct column dtypes of X. Presumably only meaningful while X is still a
# DataFrame, i.e. before it is overwritten by column_trans.fit_transform.
set(X.dtypes)

{Int64Dtype(), BooleanDtype, dtype('float64'), StringDtype}

In [93]:
def func(x, y):
    """Evaluate x*(1 - x)*cos(4*pi*x) * sin(4*pi*y**2)**2.

    Works on scalars and, through NumPy, element-wise on arrays.

    Parameters
    ----------
    x, y : float or array-like
        Coordinates at which to evaluate the function.

    Returns
    -------
    float or numpy.ndarray
        Function value(s) at the given coordinates.
    """
    envelope = x * (1 - x)
    wave_x = np.cos(4 * np.pi * x)
    wave_y = np.sin(4 * np.pi * y**2) ** 2
    return envelope * wave_x * wave_y

# Regular 100 x 200 evaluation grid spanning [0, 1] in both directions.
grid_x, grid_y = np.mgrid[0:1:100j, 0:1:200j]

# 1000 random (x, y) sample locations and the function value at each one.
points = np.random.rand(1000, 2)
sample_x, sample_y = points[:, 0], points[:, 1]
values = func(sample_x, sample_y)


from scipy.interpolate import griddata
# Interpolate the scattered samples onto the grid once per griddata method:
# grid_z0 = nearest, grid_z1 = linear, grid_z2 = cubic.
grid_z0, grid_z1, grid_z2 = (
    griddata(points, values, (grid_x, grid_y), method=method)
    for method in ('nearest', 'linear', 'cubic')
)

In [104]:
# Display the random sample locations created with np.random.rand(1000, 2).
points

array([[0.74724354, 0.30748304],
       [0.17114126, 0.29916679],
       [0.91744402, 0.46062971],
       ...,
       [0.29909433, 0.75409262],
       [0.00507967, 0.32845911],
       [0.83048236, 0.32078835]])

In [100]:
# 112 values kept in one flat list, laid out here as 7 lines of 16 entries.
data = [
    26, 29, 33, 37, 41, 48, 55, 62, 68, 74, 80, 83, 86, 88, 90, 92,
    42, 48, 54, 61, 69, 80, 91, 103, 114, 124, 133, 139, 144, 148, 152, 156,
    60, 70, 79, 90, 102, 117, 134, 153, 168, 182, 196, 204, 211, 219, 226, 231,
    82, 95, 107, 122, 139, 160, 183, 208, 228, 248, 267, 276, 286, 297, 306, 314,
    106, 124, 138, 157, 179, 205, 235, 267, 293, 318, 343, 358, 370, 380, 390, 398,
    130, 152, 171, 196, 224, 256, 292, 332, 364, 394, 424, 442, 456, 466, 476, 485,
    156, 183, 207, 236, 267, 307, 350, 398, 435, 470, 504, 525, 541, 554, 566, 576,
]

In [105]:
# Fold the flat `data` list into `rows` lists holding `cols` values each.
rows, cols = 7, 16
iFit = [[data[r * cols + c] for c in range(cols)] for r in range(rows)]
n = rows * cols  # number of entries of `data` consumed