# linear-regression pridiction

In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from evolution import rmsle

In [3]:
# Load the dataset
df = pd.read_csv("../data/train.csv", index_col='id')

# standardize the features and lebel the "Sex" features
df['Sex'] = df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

# standardize the features
scaler = StandardScaler()
scaler.fit(df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])
df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.transform(df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])
df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,0.278317,0.288912,0.382451,-0.038314,-0.060061,-0.227155,0.108309,11
1,1,0.955044,0.900996,0.250897,0.745005,0.573416,1.061143,0.722736,11
2,2,-3.020727,-2.975535,-2.906386,-1.678148,-1.640084,-1.649238,-1.69657,6
3,0,0.658976,0.747975,0.382451,0.27414,0.16985,0.357534,0.185113,10
4,2,0.320613,0.237905,-0.143763,-0.015371,0.140499,-0.09337,-0.218105,9


In [4]:
# splitting the dataset

X_train, X_test, y_train, y_test = train_test_split(df[['Sex', 'Length', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']], df['Rings'], test_size=0.2, random_state=42)
# now let's train the classifier model
linreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

linreg.fit(df[['Sex', 'Length', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']].values, df['Rings'].values)


y_pred = linreg.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)
print("Classification report:\n", classification_report(y_test, y_pred))
print("RMSLE score :", rmsle(y_test, y_pred))


Accuracy score: 0.3609777630635105
Classification report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        77
           4       0.56      0.51      0.53       284
           5       0.46      0.52      0.49       596
           6       0.44      0.37      0.40      1088
           7       0.44      0.51      0.47      1781
           8       0.46      0.48      0.47      2947
           9       0.37      0.58      0.45      3482
          10       0.24      0.27      0.26      2454
          11       0.32      0.29      0.31      1636
          12       0.00      0.00      0.00       965
          13       0.15      0.23      0.18       786
          14       0.00      0.00      0.00       519
          15       0.00      0.00      0.00       416
          16       0.18      0.15      0.16       287
          17       0.1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# # choosing best parameters
# from itertools import combinations

# features = list(df.columns)
# features.remove('Rings')
# # features.remove('Sex')

# all_combinations = []
# for i in range(2, len(features)+1):
#     all_combinations.extend(combinations(features, i))

# print(len(all_combinations))

# # splitting the dataset

# min_rmsle_score = 10000
# best_features = None

# for f in all_combinations:
#     X_train, X_test, y_train, y_test = train_test_split(df[list(f)], df['Rings'], test_size=0.2, random_state=42)
#     linreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
#     linreg.fit(X_train, y_train)
#     y_pred = linreg.predict(X_test)
#     rmsle_score = rmsle(y_test, y_pred)
#     if rmsle_score < min_rmsle_score:
#         min_rmsle_score = rmsle_score
#         best_features = f

# print("Best features:", best_features)
# print("Min RMSLE score:", min_rmsle_score)


In [6]:
# let's store this model for future use
import pickle 

with open("../trained_models/LinearRegression.pkl", "wb") as f:
    pickle.dump(linreg, f)

In [7]:
# let's submit knn result to the kaggle competition

test_df = pd.read_csv("../data/test.csv", index_col="id")
test_df['Sex'] = test_df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.fit_transform(test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

test_df['pred_Rings'] = linreg.predict(test_df[['Sex', 'Length', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

test_df.to_csv('../data/modified_test.csv', index=False)

sub = pd.DataFrame({'id': test_df.index, 'Rings': test_df['pred_Rings']})
sub.to_csv("../submission/LinearRegression.csv", index=False)



In [8]:
# submitting to kaggle

!kaggle competitions submit -c playground-series-s4e4 -f ../submission/LinearRegression.csv -m "LinearRegression model submission"

Successfully submitted to Regression with an Abalone Dataset


  0%|          | 0.00/599k [00:00<?, ?B/s]
  1%|▏         | 8.00k/599k [00:00<00:43, 13.9kB/s]
100%|██████████| 599k/599k [00:03<00:00, 198kB/s]  





In [9]:
# find out the most useful features of naive bayes
