In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('../datasets/travel_insurance_us.csv')

train, valid = train_test_split(data, random_state=12345)

def target_features(df, tar_col, remove=[]):
    target = df[tar_col]
    remove.append(tar_col)
    features = df[[x for x in train.columns.values if x not in remove]]
    return target, features

remove_list = []
target_train, features_train = target_features(train, 'Claim', remove_list)
target_valid, features_valid = target_features(valid, 'Claim', remove_list)

print(features_train.shape[0])
print(features_valid.shape[0])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = pd.read_csv('../datasets/travel_insurance_us.csv')

data_ohe = pd.get_dummies(data, drop_first=True)
target = data_ohe['Claim']
features = data_ohe.drop('Claim', axis=1)

target_train, target_valid, features_train, features_valid = train_test_split(target, features)

print('Trained!')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('travel_insurance_us.csv')

data_ohe = pd.get_dummies(data, drop_first=True)
target = data_ohe['Claim']
features = data_ohe.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345)

numeric = ['Duration', 'Net Sales', 'Commission (in value)', 'Age']

scaler = StandardScaler()
scaler.fit(features_train[numeric])
features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])

print(features_train.shape)

In [None]:
import pandas as pd

target = pd.Series([1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1])
predictions = pd.Series([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1])

print(((target==0) & (predictions==0)).sum())



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = pd.read_csv('../datasets/travel_insurance_us_preprocessed.csv')

target = data['Claim']
features = data.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_train, target_train)

probabilities_one_valid = model.predict_proba(features_valid)[:, 1]

print(probabilities_one_valid[:5])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

data = pd.read_csv('../datasets/travel_insurance_us_preprocessed.csv')

target = data['Claim']
features = data.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_train, target_train)

probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid)

print(auc_roc)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('../datasets/flights.csv')

data_ohe = pd.get_dummies(data, drop_first=True)

target = data_ohe['Arrival Delay']
features = data_ohe.drop(['Arrival Delay'], axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

numeric = ['Day', 'Day Of Week', 'Origin Airport Delay Rate',
       'Destination Airport Delay Rate', 'Scheduled Time', 'Distance',
       'Scheduled Departure Hour', 'Scheduled Departure Minute']

pd.options.mode.chained_assignment = None
scaler = StandardScaler()
scaler.fit(features_train.loc[:, numeric])
features_train.loc[:,numeric] = scaler.transform(features_train.loc[:, numeric])
features_valid.loc[:, numeric] = scaler.transform(features_valid.loc[:, numeric])

print(features_train.shape)
print(features_valid.shape)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

data = pd.read_csv('../datasets/flights_preprocessed.csv')

target = data['Arrival Delay']
features = data.drop(['Arrival Delay'], axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

model = LinearRegression()
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
mse = mean_squared_error(target_valid, predicted_valid)

print('Linear Regression')
print('MSE =', mse)
print('RMSE =', mse ** 0.5)

mse = mean_squared_error(target_valid, [target_valid.mean() for x in target_valid])

print('Mean')
print('MSE =', mse)
print('RMSE =', mse ** 0.5)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

df = pd.read_csv('../datasets/flights_preprocessed.csv')

df_target = df['Arrival Delay']
df_features = df.drop('Arrival Delay', axis=1)
train, valid = train_test_split(df, test_size=0.25, random_state=1995)

target_train = train['Arrival Delay']
target_valid = valid['Arrival Delay']
features_train = train.drop('Arrival Delay', axis=1)
features_valid = valid.drop('Arrival Delay', axis=1)

model = RandomForestRegressor(max_depth=11, n_estimators=90, random_state=1995)


In [2]:
model.fit(features_train, target_train)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=90,
                      n_jobs=None, oob_score=False, random_state=1995,
                      verbose=0, warm_start=False)

In [7]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(df_target, model.predict(df_features))

23.622650569086826

In [8]:
dump(model, 'model.joblib')

['model.joblib']