In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

filepaths = [   "Ultimate_AAPL_filtered.csv",    
                "Ultimate_TSLA_filtered.csv",    
                "Ultimate_MSFT_filtered.csv",    
                "Ultimate_GOOG_filtered.csv",    
                "Ultimate_AMZN_filtered.csv",]

scaler = MinMaxScaler()
knn_reg = KNeighborsRegressor(n_neighbors=5)

# Train the model using the first four datasets
X_train_list = []
y_train_list = []

for filepath in filepaths[:-1]:
    df = pd.read_csv(filepath)

    X = df[["comment_num", "retweet_num", "like_num", "follower_count", "compound_score", "Close_price-today", "Close_price-tmr"]]
    y = df['Close_price-2days']

    X_scaled = scaler.fit_transform(X)
    X_train_list.append(X_scaled)
    y_train_list.append(y.values)

X_train = np.vstack(X_train_list)
y_train = np.concatenate(y_train_list)

knn_reg.fit(X_train, y_train)

# Test the model on the fifth dataset
df_test = pd.read_csv(filepaths[-1])

X_test = df_test[["comment_num", "retweet_num", "like_num", "follower_count", "compound_score", "Close_price-today", "Close_price-tmr"]]
y_test = df_test['Close_price-2days']

X_test_scaled = scaler.transform(X_test)

test_score = knn_reg.score(X_test_scaled, y_test)

print("Test score on the fifth dataset:", test_score)


Test score on the fifth dataset: -1.1454711269496993


In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor

filepaths = [    "Ultimate_AAPL_filtered.csv",    "Ultimate_TSLA_filtered.csv",    "Ultimate_MSFT_filtered.csv",    "Ultimate_GOOG_filtered.csv",    "Ultimate_AMZN_filtered.csv",]

scaler = MinMaxScaler()
knn_reg = KNeighborsRegressor()

# Train the model using the first four datasets
X_train_list = []
y_train_list = []

for filepath in filepaths[:-1]:
    df = pd.read_csv(filepath)

    X = df[["comment_num", "retweet_num", "like_num", "follower_count", "compound_score", "Close_price-today", "Close_price-tmr"]]
    y = df['Close_price-2days']

    X_scaled = scaler.fit_transform(X)
    X_train_list.append(X_scaled)
    y_train_list.append(y.values)

X_train = np.vstack(X_train_list)
y_train = np.concatenate(y_train_list)

# Hyperparameter tuning using GridSearchCV
params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
grid = GridSearchCV(knn_reg, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
knn_reg_best = grid.best_estimator_

# Ensemble methods using BaggingRegressor
bagging_reg = BaggingRegressor(base_estimator=knn_reg_best, n_estimators=10, random_state=0)
bagging_reg.fit(X_train, y_train)

# Test the model on the fifth dataset
df_test = pd.read_csv(filepaths[-1])

X_test = df_test[["comment_num", "retweet_num", "like_num", "follower_count", "compound_score", "Close_price-today", "Close_price-tmr"]]
y_test = df_test['Close_price-2days']

X_test_scaled = scaler.transform(X_test)

test_score = bagging_reg.score(X_test_scaled, y_test)

print("Test score on the fifth dataset:", test_score)


Test score on the fifth dataset: -0.21682671202757486


In [4]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import BaggingRegressor

scaler = StandardScaler()

params = {
    'knn__n_neighbors': range(3, 11),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
    'knn__algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'selectkbest__k': range(3, 8)
}

knn_reg = KNeighborsRegressor()
selectkbest = SelectKBest(f_regression)

pipe = Pipeline([
    ('scaler', scaler),
    ('selectkbest', selectkbest),
    ('knn', knn_reg)
])

# Use RandomizedSearchCV instead of GridSearchCV
random_search = RandomizedSearchCV(pipe, params, n_iter=50, cv=5, n_jobs=-1, verbose=1, random_state=42)
print("Starting random search...")
random_search.fit(X_train, y_train)
print("Random search complete.")

knn_reg_best = random_search.best_estimator_
print("Best estimator found:", knn_reg_best)

bagging_reg = BaggingRegressor(base_estimator=knn_reg_best, n_estimators=10, random_state=0)
print("Fitting bagging regressor...")
bagging_reg.fit(X_train, y_train)
print("Bagging regressor fit complete.")

df_test = pd.read_csv(filepaths[-1])
X_test = df_test[["comment_num", "retweet_num", "like_num", "follower_count", "compound_score", "Close_price-today", "Close_price-tmr"]]
y_test = df_test['Close_price-2days']

X_test_scaled = scaler.transform(X_test)

test_score = bagging_reg.score(X_test_scaled, y_test)

print("Test score on the fifth dataset:", test_score)


Starting random search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 