In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Read the salary dataset from disk
file_path = '/Users/merryxmm7/Documents/GitHub/ML_Project_Fashion_Trends/datascience_salaries.csv'
df_main = pd.read_csv(file_path)

# Remove the columns that are not used below, then keep only the rows
# whose company is located in the US
df_main = df_main.drop(columns=['salary', 'salary_currency', 'work_year'])
df_main = df_main.loc[df_main['company_location'] == 'US']
# Define a function to perform one-hot encoding for categorical features
def one_hot_encode(df):
    """One-hot encode every object-dtype column of ``df``.

    Columns that are not of object dtype are passed through unchanged
    (``remainder='passthrough'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(df)``.
    """
    object_columns = list(df.select_dtypes(include=['object']).columns)
    encoding_step = ('one_hot_encoder', OneHotEncoder(), object_columns)
    column_transformer = ColumnTransformer([encoding_step], remainder='passthrough')
    return column_transformer.fit_transform(df)

# Build the feature matrix (one-hot encoded) and the target vector
X = one_hot_encode(df_main.drop(columns='salary_in_usd'))
y = df_main['salary_in_usd']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the three baseline regressors on the training split
# (``fit`` returns the estimator itself, so construction and fitting can be chained)
linear_reg = LinearRegression().fit(X_train, y_train)
decision_tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
knn_reg = KNeighborsRegressor().fit(X_train, y_train)

# Function to evaluate models and print metrics
def evaluate_model(model, X_test, y_test):
    """Print R2, mean squared error and mean absolute error for ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features to predict on
    y_test : true target values matching ``X_test``

    Returns
    -------
    None. The three metrics are printed with two decimals.
    """
    predictions = model.predict(X_test)
    # Every metric shares the (y_true, y_pred) call signature
    r2, mse, mae = (
        metric(y_test, predictions)
        for metric in (r2_score, mean_squared_error, mean_absolute_error)
    )
    print(f"R2 Score: {r2:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")

# Report test-set metrics for each baseline model, separated by a blank line
baseline_models = (
    ("Linear Regression:", linear_reg),
    ("Decision Tree Regression:", decision_tree_reg),
    ("KNN Regression:", knn_reg),
)
for position, (heading, fitted_model) in enumerate(baseline_models):
    if position > 0:
        print()
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)


Linear Regression:
R2 Score: 0.26
Mean Squared Error: 3205508802.98
Mean Absolute Error: 41855.02

Decision Tree Regression:
R2 Score: 0.24
Mean Squared Error: 3307088252.49
Mean Absolute Error: 42941.29

KNN Regression:
R2 Score: 0.17
Mean Squared Error: 3620813676.50
Mean Absolute Error: 45072.07


In [3]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor

# Feature scaling: learn the scaling on the training split only, then apply it
# to both splits. with_mean=False skips centering, as needed for sparse matrices.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Imbalanced Data Handling (if necessary)
# ros = RandomOverSampler(random_state=42)
# X_train_resampled, y_train_resampled = ros.fit_resample(X_train_scaled, y_train)

# NOTE: the base learner is passed positionally on purpose. Its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, so `base_estimator=...` raises TypeError on current
# releases. The first positional argument is the base learner both before and
# after the rename.

# Initialize and train Bagging Regressor model
bagging_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=42)
bagging_reg.fit(X_train_scaled, y_train)

# Initialize and train AdaBoost Regressor model
adaboost_reg = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=42)
adaboost_reg.fit(X_train_scaled, y_train)

# Initialize and train Gradient Boosting Regressor model
gradientboost_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
gradientboost_reg.fit(X_train_scaled, y_train)

# Report test-set metrics for each ensemble model, separated by a blank line
ensemble_models = (
    ("Bagging Regressor:", bagging_reg),
    ("AdaBoost Regressor:", adaboost_reg),
    ("Gradient Boosting Regressor:", gradientboost_reg),
)
for position, (heading, fitted_model) in enumerate(ensemble_models):
    if position > 0:
        print()
    print(heading)
    evaluate_model(fitted_model, X_test_scaled, y_test)




Bagging Regressor:
R2 Score: 0.25
Mean Squared Error: 3260406125.17
Mean Absolute Error: 42644.79

AdaBoost Regressor:
R2 Score: 0.15
Mean Squared Error: 3697994008.91
Mean Absolute Error: 46114.39

Gradient Boosting Regressor:
R2 Score: 0.26
Mean Squared Error: 3232991008.40
Mean Absolute Error: 42170.73


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline

# Read the salary dataset from disk
file_path = '/Users/merryxmm7/Documents/GitHub/ML_Project_Fashion_Trends/datascience_salaries.csv'
df_main = pd.read_csv(file_path)

# Remove the columns that are not used below
df_main = df_main.drop(columns=['salary', 'salary_currency', 'work_year'])

# One-hot encode the categorical columns, dropping the first level of each
df_main_encoded = pd.get_dummies(df_main, drop_first=True)

# Target is salary_in_usd; every other encoded column is a feature
y = df_main_encoded['salary_in_usd']
X = df_main_encoded.drop(columns='salary_in_usd')

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature engineering: degree-2 polynomial expansion, fitted on the training split only
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Model Selection and Hyperparameter Tuning (Random Forest)
# GridSearchCV is not part of this cell's import block, so it is imported here;
# without it the search raises NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# Scaling and the regressor are wrapped in one pipeline so that each CV fold
# fits the scaler on that fold's training portion only.
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

# The 'regressor__' prefix routes each parameter to the pipeline's 'regressor' step.
# 3 * 3 * 3 * 3 = 81 combinations, each fitted on 5 folds.
rf_param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Exhaustive search scored by R2, using all available cores
rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='r2', n_jobs=-1)
rf_grid_search.fit(X_train_poly, y_train)

# Score the best pipeline found by the grid search on the held-out test split
rf_best_model = rf_grid_search.best_estimator_
y_pred_rf = rf_best_model.predict(X_test_poly)

# Every metric shares the (y_true, y_pred) call signature
rf_r2, rf_mse, rf_mae = (
    metric(y_test, y_pred_rf)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print("Random Forest Regression:")
print(f"R2 Score: {rf_r2:.2f}")
print(f"Mean Squared Error: {rf_mse:.2f}")
print(f"Mean Absolute Error: {rf_mae:.2f}")
print()


KeyboardInterrupt: 

In [None]:
# From-scratch decision tree classifier.
# This cell used to be disabled by wrapping the (unfinished, non-runnable) code
# in a string literal; it is now complete, runnable code.
from collections import Counter

import numpy as np


class Node:
    """A single node of the decision tree.

    Internal nodes carry ``feature``, ``threshold``, ``left`` and ``right``;
    leaf nodes carry only ``value``.

    The ``*`` in the signature makes ``value`` keyword-only, so creating a leaf
    node is explicit at the call site: ``Node(value=label)``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction value."""
        return self.value is not None


class DecisionTree:
    """Decision tree classifier that splits on maximum information gain.

    Labels must be non-negative integers, because entropy is computed with
    ``np.bincount``.

    Parameters
    ----------
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    max_depth : int
        Nodes at this depth become leaves.
    n_features : int or None
        Number of features drawn at random (without replacement) at each
        split. ``None`` means all features; the value is capped to the number
        of columns in ``fit``.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None

    def fit(self, X, y):
        """Grow the tree from a 2-D feature array ``X`` and 1-D integer labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        n_columns = X.shape[1]
        self.n_features = n_columns if not self.n_features else min(n_columns, self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)`` and return its root Node."""
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # check the stopping criteria
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self.most_common_label(y))

        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        # find the best split
        best_feature, best_thresh = self._best_split(X, y, feat_idxs)
        if best_feature is None:
            # no candidate feature was available to split on
            return Node(value=self.most_common_label(y))

        # create child nodes
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # The best split leaves one side empty (e.g. every sampled feature
            # is constant). Recursing would never shrink the data, so stop here.
            return Node(value=self.most_common_label(y))

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return the ``(feature index, threshold)`` pair with the highest information gain.

        Ties are resolved in favour of the first candidate encountered.
        Returns ``(None, None)`` when ``feat_idxs`` is empty.
        """
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            for thr in thresholds:
                # calculate the information gain
                gain = self._information_gain(y, X_column, thr)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    def _information_gain(self, y, X_column, threshold):
        """Return parent entropy minus the weighted entropy of the two children.

        Returns 0 when the threshold puts every sample on one side.
        """
        # parent entropy
        parent_entropy = self._entropy(y)

        # create children
        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # calculate the weighted avg. entropy of children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # calculate the IG
        return parent_entropy - child_entropy

    def _split(self, X_column, split_threshold):
        """Return the row indices with ``value <= threshold`` and with ``value > threshold``."""
        left_idxs = np.argwhere(X_column <= split_threshold).flatten()
        right_idxs = np.argwhere(X_column > split_threshold).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        """Return the entropy (natural log) of the integer label array ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        # skip empty classes: log(0) is undefined and they contribute nothing
        return -np.sum([p * np.log(p) for p in ps if p > 0])

    def most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError("fit must be called before predict")
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return the leaf value."""
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value
      



In [None]:
# Show the 20 most frequent company locations (value_counts sorts by count, descending)
location_counts = df_main["company_location"].value_counts()
location_counts.head(20)