In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold,GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import PowerTransformer,StandardScaler

In [2]:
# Load the labelled training table and the unlabelled test table from the
# working directory (paths are relative to where the notebook is run).
df, df_test = (
    pd.read_csv(csv_name) for csv_name in ('hacktrain.csv', 'hacktest.csv')
)

In [3]:
# Labels to predict.
target_col = df['class']

# Model inputs: every column except the label and the two non-feature columns
# ('Unnamed: 0' is presumably a leftover saved index -- confirm against the CSV).
# Missing values are intentionally left as NaN here so that the imputer fitted
# further down can fill them.
feature_cols = df.drop(columns=['class', 'ID', 'Unnamed: 0'])

In [4]:
# Fill missing feature values from the 5 nearest rows. The fitted imputer is
# kept so the same transformation can be applied to the test table later.
knn_imputer = KNNImputer(n_neighbors=5)
original_cols = list(feature_cols.columns)

# fit_transform returns a bare array, so re-attach the column labels.
X = pd.DataFrame(knn_imputer.fit_transform(feature_cols), columns=original_cols)

# Columns treated as raw NDVI readings: anything not tagged '_missing'.
# NOTE(review): nothing visible in this notebook creates '_missing' columns,
# so this currently selects every column -- confirm the filter is still needed.
ndvi_cols = [name for name in original_cols if '_missing' not in name]

In [5]:
# Standardise each NDVI column and keep the fitted scaler for the test table.
# Caution: X is overwritten in place, so running this cell a second time would
# refit the scaler on already-standardised values and lose the original
# per-column mean/scale that the test data needs.
scaler = StandardScaler().fit(X[ndvi_cols])
X[ndvi_cols] = scaler.transform(X[ndvi_cols])

In [6]:
# Row-wise summary features computed over the values currently held in
# X[ndvi_cols]. ndvi_cols was built before any engineered column was added,
# so the new columns never feed back into these statistics.
ndvi_values = X[ndvi_cols]

X['ndvi_std'] = ndvi_values.std(axis=1)
X['ndvi_min'] = ndvi_values.min(axis=1)
X['ndvi_max'] = ndvi_values.max(axis=1)
X['skew'] = ndvi_values.skew(axis=1)
X['kurtosis'] = ndvi_values.kurtosis(axis=1)

# Per-quarter means. Column names look like YYYYMMDD dates (assumption based
# on the naming only). Only these four groups are turned into features.
# Insertion order matters: the test table must end up with the same column
# order as this training table.
quarter_mean_columns = {
    'first_quarter15_mean': ['20150330_N', '20150314_N', '20150226_N',
                             '20150210_N', '20150125_N', '20150109_N'],
    'first_quarter14_mean': ['20140322_N', '20140218_N', '20140202_N',
                             '20140117_N', '20140101_N'],
    'second_quarter15_mean': ['20150602_N', '20150517_N', '20150501_N',
                              '20150415_N'],
    'fourth_quarter14_mean': ['20141117_N', '20141101_N', '20141016_N'],
}
for feature_name, date_cols in quarter_mean_columns.items():
    X[feature_name] = X[date_cols].mean(axis=1)

# Spread of the row: reuse the min/max already computed above.
X['range'] = X['ndvi_max'] - X['ndvi_min']

In [7]:
# Final training matrix and labels.
X_train = X
y_train = target_col

# Build the test matrix with exactly the columns (and column order) the
# imputer was fitted on. Selecting by the training column list, rather than
# dropping a hard-coded set of names, raises a clear KeyError if the test file
# lacks a training column and ignores any extra columns it may carry.
X_test = pd.DataFrame(
    knn_imputer.transform(df_test[original_cols]),
    columns=original_cols,
)

# Apply the scaler fitted on the training data (transform only, never refit).
X_test[ndvi_cols] = scaler.transform(X_test[ndvi_cols])

# Same engineered features, in the same order, as the training table.
ndvi_test_values = X_test[ndvi_cols]

X_test['ndvi_std'] = ndvi_test_values.std(axis=1)
X_test['ndvi_min'] = ndvi_test_values.min(axis=1)
X_test['ndvi_max'] = ndvi_test_values.max(axis=1)
X_test['skew'] = ndvi_test_values.skew(axis=1)
X_test['kurtosis'] = ndvi_test_values.kurtosis(axis=1)

test_quarter_mean_columns = {
    'first_quarter15_mean': ['20150330_N', '20150314_N', '20150226_N',
                             '20150210_N', '20150125_N', '20150109_N'],
    'first_quarter14_mean': ['20140322_N', '20140218_N', '20140202_N',
                             '20140117_N', '20140101_N'],
    'second_quarter15_mean': ['20150602_N', '20150517_N', '20150501_N',
                              '20150415_N'],
    'fourth_quarter14_mean': ['20141117_N', '20141101_N', '20141016_N'],
}
for feature_name, date_cols in test_quarter_mean_columns.items():
    X_test[feature_name] = X_test[date_cols].mean(axis=1)

X_test['range'] = X_test['ndvi_max'] - X_test['ndvi_min']

# The model is fitted on X_train, so the test matrix must match it column for
# column; fail loudly here instead of at predict time.
assert list(X_test.columns) == list(X_train.columns), \
    "train/test feature columns differ"

In [8]:
# One seed for both the CV splitter and the solver. 'saga' is a stochastic
# solver, so without random_state repeated runs can give different
# coefficients and scores.
RANDOM_STATE = 42

# L1-penalised logistic regression; C=0.1 is a fairly strong penalty.
model = LogisticRegression(
    max_iter=5000,
    penalty='l1',
    C=0.1,
    solver='saga',
    random_state=RANDOM_STATE,
)

# 5-fold stratified CV on accuracy.
# NOTE: X_train was imputed and scaled using all training rows before this
# split, so every validation fold has already influenced the preprocessing.
# Treat these scores as somewhat optimistic; wrapping imputer + scaler + model
# in a Pipeline would remove that leakage.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print(f"Average CV score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

Cross-validation scores: [0.9225   0.915    0.91     0.920625 0.918125]
Average CV score: 0.92 (+/- 0.01)


In [9]:
# Refit on the full training set (cross_val_score only fits clones), then
# store the predicted label next to each test row. fit() returns the fitted
# estimator, so the calls can be chained.
df_test['class'] = model.fit(X_train, y_train).predict(X_test)

In [10]:
# Submission file: one row per test ID with its predicted class, no index column.
submission = df_test.loc[:, ['ID', 'class']]
submission.to_csv('submit7.csv', index=False)