In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = 'training_data.csv'
data = pd.read_csv(file_path)

# Inspect the header of the dataset
data.head()

# Identifying categorical and numerical variables
categorical = data.columns.tolist()
numerical = data.columns.tolist()
target_var = data.columns[-1]

# Encode the categorical target variable
label_encoder = LabelEncoder()
data['increase_stock'] = label_encoder.fit_transform(data['increase_stock'])

# Splitting the dataset into features (X) and target variable (y)
X = data.drop('increase_stock', axis=1)
y = data['increase_stock']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Grid Search for parameter tuning
param_grid = {'n_estimators': [200, 300, 400], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5]}
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params_grid = grid_search.best_params_
print("Best Parameters from Grid Search:", best_params_grid)

# Train the model with the best parameters found
best_gb_classifier = GradientBoostingClassifier(**best_params_grid, random_state=42)
best_gb_classifier.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = best_gb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", report)

# Perform cross-validation
cv_scores = cross_val_score(gb_classifier, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")




Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters from Grid Search: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 400}
Accuracy on Test Set: 0.875
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.62      0.61        50
           1       0.93      0.92      0.93       270

    accuracy                           0.88       320
   macro avg       0.76      0.77      0.77       320
weighted avg       0.88      0.88      0.88       320

Cross-validation scores: [0.921875   0.88671875 0.8984375  0.88671875 0.9140625 ]
Mean cross-validation score: 0.9015625


In [None]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the trained Gradient Boosting Classifier model
model_path = 'best_gb_classifier.pkl'
best_gb_classifier = joblib.load(model_path)

# Load the scaler used for the training data
scaler_path = 'scaler.pkl'
scaler = joblib.load(scaler_path)

# Load the test data
test_data_path = 'test_data.csv'
test_data = pd.read_csv(test_data_path)

# Ensure that test_data only contains features, similar to how X_train was structured
X_test = test_data.drop('increase_stock', axis=1, errors='ignore')

# Scale the test data using the loaded scaler
X_test_scaled = scaler.transform(X_test)

# Generate predictions
predictions = best_gb_classifier.predict(X_test_scaled)

# Convert predictions to the required format (a single row of comma-separated values)
predictions_str = ','.join(map(str, predictions))

# Write the predictions to a CSV file
# This file will have a single line with no header
output_file_path = 'predictions.csv'  # The output file path
with open(output_file_path, 'w') as f:
    f.write(predictions_str)

print(f"Predictions have been written to {output_file_path}")


Predictions have been written to predictions.csv


In [None]:
# Load the dataset
file_path = 'predictions.csv'
data = pd.read_csv(file_path)

In [None]:
data.head()

Unnamed: 0,0,1,0.1,0.2,0.3,0.4,1.1,0.5,0.6,0.7,...,0.317,0.318,0.319,0.320,1.73,0.321,0.322,0.323,0.324,0.325


In [None]:
# Print the number of rows and columns in the DataFrame
print(data.shape)


(0, 400)


In [None]:
# Print the first few rows of the DataFrame
print(data.head())

Empty DataFrame
Columns: [0, 1, 0.1, 0.2, 0.3, 0.4, 1.1, 0.5, 0.6, 0.7, 1.2, 1.3, 0.8, 0.9, 0.10, 1.4, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 1.5, 0.18, 0.19, 0.20, 0.21, 1.6, 0.22, 0.23, 0.24, 0.25, 0.26, 1.7, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 1.8, 1.9, 0.39, 0.40, 1.10, 0.41, 0.42, 0.43, 0.44, 1.11, 0.45, 0.46, 1.12, 0.47, 0.48, 0.49, 0.50, 1.13, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 1.14, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 1.15, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80, 0.81, 1.16, 0.82, ...]
Index: []

[0 rows x 400 columns]
