In [None]:
import wbgapi as wb
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

class DataPipeline():
    """Assemble and clean a table of World Bank indicators joined to happiness scores.

    The constructor loads the happiness CSV; ``get_data`` pulls the requested
    indicators through ``wbgapi`` and merges them on the ``ISO_A3`` country code.
    The remaining methods are independent cleaning steps. Each one returns a new
    DataFrame and leaves the frame passed in untouched.
    """

    # Dtypes that the cleaning steps treat as numeric. Shared so that imputation,
    # outlier removal and scaling always operate on the same set of columns.
    _NUMERIC_DTYPES = ['float64', 'int']

    def __init__(self, indicators, year, happiness_path='../data/happiness/happiness.csv'):
        """Store the query settings and load the happiness table.

        Parameters:
        ----------
            indicators: World Bank series id(s) handed to ``wb.data.DataFrame``
            year: value handed to ``wb.data.DataFrame`` as ``time``
            happiness_path (str): CSV containing at least the columns 'Country'
                and 'ISO_A3'; 'Country' is dropped after loading

        """
        self.indicators = indicators
        self.year = year

        self.happiness_data = pd.read_csv(happiness_path).drop('Country', axis=1)
        self.valid_countries = self.happiness_data['ISO_A3'].unique()

        # Module-wide wbgapi setting: every later wbgapi call uses database id 2.
        # NOTE(review): id 2 is presumably World Development Indicators - confirm.
        wb.db = 2

    @classmethod
    def _numeric_columns(cls, data: pd.DataFrame) -> list:
        """Return the names of the columns the cleaning steps operate on."""
        return data.select_dtypes(include=cls._NUMERIC_DTYPES).columns.tolist()

    def pull_wb_data(self):
        """Fetch the configured indicators for the configured year via wbgapi."""
        return wb.data.DataFrame(self.indicators, time=self.year)

    def get_data(self):
        """Return the World Bank features merged with the happiness table.

        The index returned by wbgapi is turned into the first column and renamed
        to 'ISO_A3', which is the merge key.
        """
        features = self.pull_wb_data()
        features = features.reset_index()
        features = features.rename(columns={features.columns[0]: 'ISO_A3'})
        # Keep only economies that also appear in the happiness table.
        features = features[features['ISO_A3'].isin(self.valid_countries)]
        merged = pd.merge(features, self.happiness_data, on='ISO_A3')
        return merged

    def impute_numeric_data(self, data: pd.DataFrame) -> pd.DataFrame:
        '''Fill in missing numerical data with the mean of its 5 nearest neighbors
        as determined by its nonmissing numerical data. No categorical features are
        touched.

        Numeric columns without a single observed value are left as they are:
        KNNImputer drops such columns from its output, so they cannot be imputed.

        Parameters:
        ----------
            data  (pd.DataFrame): dataframe to impute; it is not modified

        Returns:
        ----------
            pd.DataFrame: copy of ``data`` with the imputable numeric columns filled

        '''
        result = data.copy()

        # Alter only numerical columns that have at least one observed value
        imputable_cols = [
            col for col in self._numeric_columns(result) if result[col].notna().any()
        ]
        if not imputable_cols:
            # Nothing to impute (no rows, no numeric columns, or only empty ones)
            return result

        # Impute data and write it back into the copy
        result[imputable_cols] = KNNImputer().fit_transform(result[imputable_cols])

        return result

    def remove_outliers(self, data: pd.DataFrame, cutoff: float = 0.0, random_state=None) -> pd.DataFrame:
        """Remove outliers with an isolation forest

        Only numerical data is used to score rows. All NaNs must already be removed.
        The fraction of rows removed is printed.

        Parameters:
        ----------
            data  (pd.DataFrame): dataframe to filter; it is not modified
            cutoff (float): Cutoff for anomaly scores. Typically, anything less than 0
                is considered an outlier
            random_state: seed passed to IsolationForest; the default None keeps
                the forest unseeded, so results can differ between runs

        Returns:
        ----------
            pd.DataFrame: independent copy of the rows whose score is >= cutoff

        """
        # Record original n_samples
        n_samples = data.shape[0]

        # Score only numeric data
        numeric_cols = self._numeric_columns(data)
        if n_samples == 0 or not numeric_cols:
            # Nothing to score: the forest cannot be fitted, so nothing is removed
            print(0.0)
            return data.copy()
        numeric_data = data[numeric_cols]

        # Create anomaly scores for dataframe
        forest = IsolationForest(random_state=random_state).fit(numeric_data)
        scores = forest.decision_function(numeric_data)

        # Print fraction of data removed
        print((scores < cutoff).sum() / n_samples)

        # Filter out anomalous rows; copy so later steps can assign without warnings
        return data[scores >= cutoff].copy()

    def scale_to_standard(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize features by making the mean 0 and the variance 1

        Only numerical data is modified. All NaNs must already be removed.

        Parameters:
        ----------
            data  (pd.DataFrame): dataframe to scale; it is not modified

        Returns:
        ----------
            pd.DataFrame: copy of ``data`` with the numeric columns standardized

        """
        result = data.copy()

        # Change only numeric data
        numeric_cols = self._numeric_columns(result)
        if result.empty or not numeric_cols:
            # StandardScaler cannot be fitted on zero rows or zero columns
            return result

        # Replace whole columns rather than writing through .loc, so integer
        # columns become float columns instead of being upcast in place
        result[numeric_cols] = StandardScaler().fit_transform(result[numeric_cols])

        return result
        
        

# Pull GDP per capita and total population for 2022 and join them to the
# happiness scores.
dp = DataPipeline(['NY.GDP.PCAP.CD', 'SP.POP.TOTL'], 2022)
data = dp.get_data()
print(data.head())

# Missing-value counts and dtypes before imputation
print(data.isna().sum())
print(data.dtypes)

# Fill in missing numerical data with the mean of its 5 nearest neighbors as
#   determined by its nonmissing numerical data. No categorical features are
#   touched. The pipeline method is used so this cell and the class cannot
#   drift apart.
data = dp.impute_numeric_data(data)

# Missing-value counts after imputation
print(data.isna().sum())

  ISO_A3  NY.GDP.PCAP.CD  SP.POP.TOTL  Happiness score
0    AFG      352.603733   41128771.0           2.4038
1    ALB     6810.114041    2777689.0           5.1988
2    ARE    53707.980081    9441129.0           6.5760
3    ARG    13650.604629   46234830.0           5.9670
4    ARM     7018.051504    2780469.0           5.3986
ISO_A3             0
NY.GDP.PCAP.CD     2
SP.POP.TOTL        0
Happiness score    0
dtype: int64
ISO_A3              object
NY.GDP.PCAP.CD     float64
SP.POP.TOTL        float64
Happiness score    float64
dtype: object
ISO_A3             0
NY.GDP.PCAP.CD     0
SP.POP.TOTL        0
Happiness score    0
dtype: int64


In [None]:
from model import Model

# Positional split of the merged table: the first column is skipped, the last
# column is the target, and everything in between is a feature.
feature_frame = data.iloc[:, 1:-1]
target_series = data.iloc[:, -1]
X, y = feature_frame.to_numpy(), target_series.to_numpy()

# Settings for the project-local Model wrapper, gathered in one place.
model_config = {
    'model_type': 'rdf',
    'model_choice': 'reg',
    'tuning_strategy': 'random',
}
model = Model(**model_config)
model.make_quick_model()
model.make_full_model(X, y)

  from .autonotebook import tqdm as notebook_tqdm


Now making RandomForestRegressor...
Training and hypertuning using Exhaustive Search...


KeyboardInterrupt: 

In [9]:
# Dump the wbgapi series listing to a text file for offline browsing.
info = wb.series.info()

# The context manager closes the handle, so the text is flushed to disk.
# An explicit encoding keeps the write independent of the platform default.
with open('features.txt', 'w', encoding='utf-8') as f:
    chars_written = f.write(str(info))

# Show the number of characters written as the cell's output.
chars_written

131760