In [1]:
import wbgapi as wb
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.impute import KNNImputer

class WbDatalPuller():
    '''Pull World Bank indicators for one year and merge them with the happiness data.

    Constructing an instance reads the happiness CSV, fetches the requested
    indicators through ``wbgapi`` and, unless ``impute`` is false, fills the
    missing numeric values. The resulting frame is kept in ``self.data``.
    '''

    def __init__(self, indicators, year, impute=True):
        '''Load the happiness data, pull the indicators and optionally impute.

        Parameters
        ----------
        indicators : list
            Indicator ids passed straight to ``wb.data.DataFrame``.
        year : int
            Passed as the ``time`` argument of ``wb.data.DataFrame``.
        impute : bool, default True
            When true, ``impute_numeric_data`` is run on the merged frame.
        '''
        self.indicators = indicators
        self.year = year
        self.happiness_data = pd.read_csv('../data/happiness/happiness.csv').drop('Country', axis=1)
        self.valid_countries = self.happiness_data['ISO_A3'].unique()

        # Sets the module-level ``db`` attribute of wbgapi, so this is global to
        # the kernel rather than scoped to this instance.
        wb.db = 2
        self.data = self.pull_data()
        if impute:
            self.data = self.impute_numeric_data()

    def pull_wb_data(self):
        '''Return the raw ``wb.data.DataFrame`` result for the configured indicators and year.'''
        data = wb.data.DataFrame(self.indicators, time=self.year)
        return data

    def pull_data(self):
        '''Fetch the indicators and merge them with the happiness data on ``ISO_A3``.

        Only rows whose ``ISO_A3`` value appears in the happiness data are kept.
        The happiness columns are appended to the right of the indicator columns.
        '''
        print(f"Pulling {len(self.indicators)} indicators from World Bank data...")
        features = self.pull_wb_data()
        print("Done")
        features = features.reset_index()
        features = features.infer_objects()
        # Assumes the first column after reset_index() holds the economy code
        # (the former index) -- TODO confirm against the wbgapi output.
        features = features.rename(columns={features.columns[0]: 'ISO_A3'})
        features = features[features['ISO_A3'].isin(self.valid_countries)]
        merged = pd.merge(features, self.happiness_data, on='ISO_A3')
        return merged

    def impute_numeric_data(self) -> pd.DataFrame:
        '''Fill missing numeric values in ``self.data`` with a default ``KNNImputer``.

        Only ``float64`` / ``int`` columns are touched; all other columns are
        left as they are. ``self.data`` is modified in place and also returned.

        NOTE(review): the numeric block is transposed before imputation, so the
        imputer treats each column (feature) as a sample and each row as a
        feature. Neighbours are therefore other columns, not other rows. This
        may be a workaround for a shape mismatch when a column is entirely
        missing -- confirm that this is the intended behaviour.
        '''
        numeric_data = self.data.select_dtypes(include=['float64', 'int'])
        imputed_data = KNNImputer().fit_transform(numeric_data.T).T
        self.data[numeric_data.columns] = imputed_data
        return self.data

    def get_data(self):
        '''Return the merged frame, pulling it first if ``self.data`` is ``None``.'''
        if self.data is None:
            self.data = self.pull_data()
        return self.data

    def check_missing(self, threshold):
        '''Print and return the columns that are less complete than ``threshold``.

        Parameters
        ----------
        threshold : float
            Required fraction of non-missing values (``1.0`` means fully complete).

        Returns
        -------
        pandas.Series
            NaN counts, indexed by column name, for every column whose NaN
            count exceeds ``round(n_rows * (1 - threshold))``.
        '''
        nans = self.data.isna().sum()
        # Use this instance's own frame; the previous version read a notebook
        # global named ``data``, which may not exist or may be a different frame.
        n_rows = self.data.shape[0]
        # Largest NaN count a column may have and still meet the threshold.
        max_missing = np.round(n_rows * (1 - threshold))
        cols = nans[nans > max_missing]
        print(f"The following features are less than {100*threshold}% complete:")
        for col in cols.index:
            pcomplete = 1 - (nans[col] / n_rows)
            print(f"   {col}: {pcomplete*100}% complete")
        return cols

    def get_missing_percentages(self):
        '''Return, per column, the fraction (0 to 1) of rows in ``self.data`` that are NaN.'''
        nans = self.data.isna().sum()
        return nans / self.data.shape[0]

# Indicator ids pulled for the first exploratory model.
INDICATORS = [
    'NY.GDP.PCAP.CD',
    'SP.POP.TOTL',
    'SE.SEC.CMPT.LO.ZS',
    'SE.XPD.TERT.PC.ZS',
    'SE.XPD.TOTL.GB.ZS',
    'AG.CON.FERT.PT.ZS',
]

# Build the puller for 2022 (imputation on by default) and keep its merged frame.
dp = WbDatalPuller(INDICATORS, 2022)
data = dp.get_data()

# List every column that still has missing values, then emit a blank line.
dp.check_missing(1.0)
print()



  from .autonotebook import tqdm as notebook_tqdm


Pulling 6 indicators from World Bank data...
Done
The following features are less than 100.0% complete:



In [2]:
def clean_features():
    '''Measure, for every World Bank series, how much of it is missing in 2022.

    Each series returned by ``wb.series.list()`` is pulled on its own through
    ``WbDatalPuller`` and its missing fraction is recorded.

    Returns
    -------
    pandas.DataFrame
        One row per series with columns ``id``, ``value`` and
        ``missing_percent`` (the fraction of rows that are NaN, 0 to 1).
    '''
    indicators = wb.series.list()
    ids = [(str(indicator['id']), str(indicator['value'])) for indicator in indicators]

    entries = []
    # Wrapping the iterable lets tqdm infer the total and close the bar even
    # when a pull raises part-way through.
    for indicator_id, value in tqdm(ids, position=0, leave=True):
        # impute=False: missingness has to be measured on the raw values.
        # With the default impute=True the NaNs are filled before they are counted.
        puller = WbDatalPuller([indicator_id], 2022, impute=False)
        missing_percent = puller.get_missing_percentages()[indicator_id]
        entries.append([indicator_id, value, missing_percent])

    return pd.DataFrame(entries, columns=['id', 'value', 'missing_percent'])

# feature_data = clean_features()
# feature_data.to_csv('feature_data.csv', index=False)

In [3]:
from model import Model

# Features: every column of `data` except the first and the last one.
X = data.iloc[:, 1:-1].to_numpy()
# Target: the last column of `data`.
y = data.iloc[:, -1].to_numpy()

# NOTE(review): the recorded run of this cell raised
# "ValueError: Model type must be either 'clf' ... or 'reg' ... Got reg".
# `Model` lives in the project's `model` module, which is not visible here, so
# confirm the accepted values of `model_type` / `model_choice` against it.
model = Model(model_type='rdata', model_choice='reg', tuning_strategy='random')
model.make_quick_model()
model.make_full_model(X, y)

ValueError: Model type must be either 'clf' for classification or 'reg' for regression. Got reg

In [None]:
# Save the string form of the World Bank series listing for offline reference.
info = wb.series.info()
# The context manager closes (and flushes) the file; the previous bare open()
# left the handle open for the lifetime of the kernel. The explicit encoding
# keeps the output independent of the platform's default codec.
with open('features.txt', 'w', encoding='utf-8') as f:
    f.write(str(info))

131760

In [4]:
# Load the per-indicator table previously written to feature_data.csv.
feature_data = pd.read_csv('feature_data.csv')

# NOTE(review): `get_missing_percentages` returns the fraction of rows that are
# missing, so `missing_percent == 1.0` would select fully *missing* indicators,
# which contradicts the `no_missing` name. Confirm what the CSV column holds.
is_selected = feature_data['missing_percent'] == 1.0
no_missing = [str(indicator_id) for indicator_id in feature_data.loc[is_selected, 'id'].tolist()]

# Pull (and, by default, impute) all selected indicators for 2022.
dp = WbDatalPuller(no_missing, 2022)



Pulling 433 indicators from World Bank data...


  pattern = '(?<!\w).{{0,{len}}}{term}.{{0,{len}}}(?!\w)'.format(term=re.escape(q), len=padding)


KeyboardInterrupt: 

In [5]:
# One-off export of the pulled frame, kept commented out so the cell only reads the cached CSV:
# dp.get_data().to_csv('no_missing.csv', index=False)
# Rebinds `data`, replacing the frame assigned to that name in the first cell.
data = pd.read_csv('no_missing.csv')

In [6]:
# NOTE(review): the recorded run of this cell raised
# "ValueError: Model type must be either 'clf' ... or 'reg' ... Got reg".
# `Model` is defined outside this file; confirm its accepted argument values.
model = Model(model_type='xgb', model_choice='reg', tuning_strategy='bayesian')
# Features: every column of `data` except the first and the last one.
X_train = data.iloc[:, 1:-1].to_numpy()
# Target: the last column of `data`.
y_train = data.iloc[:, -1].to_numpy()

model.make_full_model(X_train, y_train)

ValueError: Model type must be either 'clf' for classification or 'reg' for regression. Got reg