In [1]:
class SkillAnalysisTab:
    def __init__(self, tab):
        self.tab = tab
        self.metrics_visible = False  # Flag to track visibility of evaluation metrics
        self.load_data_and_process()
        self.create_skills_analysis_tab()

    def load_data_and_process(self):
        # Check if the results file already exists
        results_file = 'results.json'
        if os.path.exists(results_file):
            # If the file exists, load the results from the file
            with open(results_file, 'r') as file:
                results = json.load(file)
                self.top_skills = results['top_skills']
                self.precision = results['precision']
                self.recall = results['recall']
                self.f1 = results['f1']
        else:
            # Load English tokenizer, tagger, parser, NER, and word vectors
            nlp = spacy.load("en_core_web_sm")

            # Load and prepare the skills from the Excel file
            df = pd.read_excel('all_emsi_skills.xlsx')  # Load hard skills from an Excel file
            known_skills = [skill.lower() for skill in df['name'].tolist()]

            def preprocess_skills(skills):
                patterns = []
                for skill in skills:
                    primary_skill = skill.split('(')[0].strip()
                    patterns.append(nlp.make_doc(primary_skill))
                return patterns

            def extract_skills(job_description):
                skills = set()
                doc = nlp(job_description.lower())
                matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
                patterns = preprocess_skills(known_skills)
                matcher.add("SKILL", patterns)
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start:end]
                    skills.add(span.text)
                return skills

            # Read CSV data into a DataFrame
            df = pd.read_csv('job_descriptions_corrected_dataset.csv')

            y_true = []
            y_pred = []
            for _, row in df.iterrows():
                description = row["job_description"]
                annotated_skills = row["annotated_hard_skills"].strip("[]").split(",")
                annotated_skills = {x.lstrip() for x in annotated_skills}
                extracted_skills = extract_skills(description)
                y_true.append(annotated_skills)
                y_pred.append(extracted_skills)
                


            y_true = [{item.lower() for item in s} for s in y_true]
            # If the file doesn't exist, run the model and save the results to the file
            # Read the new CSV file with unseen job postings
            new_df = pd.read_csv('job_descriptions.csv')

            # Initialize a list to store all extracted skills
            all_skills = []

            # Iterate over rows in the new DataFrame
            for _, row in new_df.iterrows():
                description = row["Descriptions"]  # Assuming the column name is "job description"
                
                # Extract skills from the description
                extracted_skills = extract_skills(description)
                
                # Remove "data science" from the extracted skills
                extracted_skills = [skill for skill in extracted_skills if skill.lower() != 'data science']
                
                # Append the extracted skills to the all_skills list
                all_skills.extend(extracted_skills)

            # Count the frequency of each skill
            skill_counts = Counter(all_skills)

            # Get the top 5 skills in demand
            self.top_skills = skill_counts.most_common(5)

            # Flatten the lists for evaluation
            true_skills_flat = [skill for skills in y_true for skill in skills]
            predicted_skills_flat = [skill for skills in y_pred for skill in skills]

            # Create binary labels for each skill in the union of all skills
            all_skills = set(true_skills_flat) | set(predicted_skills_flat)
            true_labels = [int(skill in true_skills_flat) for skill in all_skills]
            predicted_labels = [int(skill in predicted_skills_flat) for skill in all_skills]

            # Calculate evaluation metrics
            self.precision = precision_score(true_labels, predicted_labels)
            self.recall = recall_score(true_labels, predicted_labels)
            self.f1 = f1_score(true_labels, predicted_labels)

            # Save the results to a file
            results = {
                'top_skills': self.top_skills,
                'precision': self.precision,
                'recall': self.recall,
                'f1': self.f1
            }
            with open(results_file, 'w') as file:
                json.dump(results, file)

    def create_skills_analysis_tab(self):
        # Create a frame for the top 5 skills
        top_skills_frame = ttk.Frame(self.tab)
        top_skills_frame.pack(pady=10)

        # Display the top 5 skills
        top_skills_label = ttk.Label(top_skills_frame, text="Top 5 Skills in Demand:", font=("Arial", 16))
        top_skills_label.pack()

        for skill, count in self.top_skills:
            skill_label = ttk.Label(top_skills_frame, text=f"{skill}: {count}", font=("Arial", 12))
            skill_label.pack()

        # Create a frame for the bar graph
        graph_frame = ttk.Frame(self.tab)
        graph_frame.pack(pady=10)

        # Create a bar graph of the top 5 skills and their frequencies
        fig, ax = plt.subplots(figsize=(6, 4))
        skills, counts = zip(*self.top_skills)
        ax.bar(skills, counts)
        ax.set_xlabel("Skills")
        ax.set_ylabel("Frequency")
        ax.set_title("Top 5 Skills in Demand")

        # Rotate x-axis labels if needed
        plt.xticks(rotation=45, ha='right')

        # Adjust layout to prevent overlapping
        plt.tight_layout()

        # Create a canvas for the graph
        canvas = FigureCanvasTkAgg(fig, master=graph_frame)
        canvas.draw()

        # Pack the canvas
        canvas.get_tk_widget().pack()

        # Create a frame for the advanced button and evaluation metrics
        self.advanced_frame = ttk.Frame(self.tab)
        self.advanced_frame.pack(pady=10)

        # Create an advanced button
        self.advanced_button = ttk.Button(self.advanced_frame, text="Advanced", command=self.toggle_evaluation_metrics)
        self.advanced_button.pack()

        # Create a frame for the evaluation metrics
        self.metrics_frame = ttk.Frame(self.advanced_frame)

    def toggle_evaluation_metrics(self):
        if self.metrics_visible:
            self.metrics_frame.pack_forget()  # Hide the evaluation metrics
            self.metrics_visible = False
        else:
            self.show_evaluation_metrics()  # Show the evaluation metrics
            self.metrics_visible = True

    def show_evaluation_metrics(self):
        # Clear the existing evaluation metrics frame
        for widget in self.metrics_frame.winfo_children():
            widget.destroy()

        # Create labels for the evaluation metrics
        precision_label = ttk.Label(self.metrics_frame, text=f"Precision: {self.precision:.4f}")
        recall_label = ttk.Label(self.metrics_frame, text=f"Recall: {self.recall:.4f}")
        f1_label = ttk.Label(self.metrics_frame, text=f"F1-Score: {self.f1:.4f}")

        # Pack the labels
        precision_label.pack()
        recall_label.pack()
        f1_label.pack()

        # Pack the evaluation metrics frame
        self.metrics_frame.pack()

In [2]:
class MarketAnalysis:
    def __init__(self, api_key):
        self.fred = Fred(api_key=api_key)
        self.start_date = '1990-01-01'
        self.end_date = pd.Timestamp.now().strftime('%Y-%m-%d')
        self.indicators = ['AMVPNO', 'CES3133600101', 'CUUR0000SETC', 'DMOTRC1Q027SBEA', 'RSMVPD']
        self.data_df = self.fetch_data()
        self.X, self.y = self.prepare_data()
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data()
        self.scaler = MinMaxScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.X_train_scaled = self.preprocess_data(self.X_train)
        self.X_test_scaled = self.preprocess_data(self.X_test)
        self.nn_model = self.build_nn_model()
        self.rf_model = self.build_rf_model()
        self.train_nn_model()
        self.train_rf_model()
        self.nn_report, self.rf_report, self.latest_category = self.evaluate_models()

    def fetch_data(self):
        data_df = pd.DataFrame()
        for indicator in self.indicators:
            data = self.fred.get_series(indicator, observation_start=self.start_date, observation_end=self.end_date, frequency='q')
            data_df[indicator] = data.ffill().bfill()
        np.random.seed(42)
        data_df['GrowthCategory'] = np.random.choice([0, 1, 2, 3], size=len(data_df))
        data_df[self.indicators] = data_df[self.indicators].pct_change().fillna(0)
        return data_df

    def prepare_data(self):
        X = self.data_df[self.indicators].values
        y = self.data_df['GrowthCategory'].values
        return X, y

    def split_data(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.25, random_state=42)
        return X_train, X_test, y_train, y_test

    def preprocess_data(self, X):
        X_scaled = self.scaler.fit_transform(self.imputer.fit_transform(X))
        return X_scaled

    def build_nn_model(self):
        model = Sequential([
            Dense(128, input_dim=self.X_train_scaled.shape[1]),
            BatchNormalization(),
            Dropout(0.3),
            Dense(64, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
            Dense(32, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
            Dense(4, activation='softmax')
        ])
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train_nn_model(self):
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(self.X_train_scaled, self.y_train)
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        self.nn_model.fit(X_train_resampled, to_categorical(y_train_resampled), epochs=200, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=2)

    def build_rf_model(self):
        rf = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
        return grid_search

    def train_rf_model(self):
        self.rf_model.fit(self.X_train_scaled, self.y_train)

    def evaluate_models(self):
        nn_predictions = np.argmax(self.nn_model.predict(self.X_test_scaled), axis=1)
        nn_report = classification_report(self.y_test, nn_predictions)
        best_rf = self.rf_model.best_estimator_
        rf_predictions = best_rf.predict(self.X_test_scaled)
        rf_report = classification_report(self.y_test, rf_predictions)
        latest_data = self.data_df.iloc[-1]
        latest_indicators = latest_data[self.indicators].values.reshape(1, -1)
        latest_scaled_data = self.scaler.transform(self.imputer.transform(latest_indicators))
        latest_category = np.argmax(self.nn_model.predict(latest_scaled_data), axis=1)[0]
        return nn_report, rf_report, latest_category

    def save_results(self, filename='classification_results.pkl'):
        with open(filename, 'wb') as f:
            pickle.dump((self.nn_report, self.rf_report, self.latest_category), f)

    def load_results(self, filename='classification_results.pkl'):
        with open(filename, 'rb') as f:
            self.nn_report, self.rf_report, self.latest_category = pickle.load(f)

In [3]:
class JobPostingsAnalysis:
    def __init__(self, filepath='job_postings_by_sector_US.csv'):
        self.filepath = filepath
        self.job_postings = pd.read_csv(filepath, parse_dates=['date'])
        self.unique_jobs = self.job_postings['display_name'].unique()
        self.historical_data_sp500 = None
        self.historical_data_btc = None
        self.treasury_inflation_expectations = None
        self.inflation_expectations_5_years = None
        self.nominal_broad_dollar_index = None
        self.job_postings_indeed = None
        self.new_job_postings_indeed = None
        self.treasury_spread = None
        self.federal_funds_rate = None
        self.treasury_yield_10y = None
        self.equity_market_volatility = None
        self.high_yield_index_yield = None
        self.bank_prime_loan_rate = None


    def load_additional_data(self, fred_api_key):
        # Initialize the FRED API
        fred = Fred(api_key=fred_api_key)
        end_date = '2024-03-22'
        end_date_yfinance = '2024-03-23'
        
        # Load Job Postings on Indeed in the United States (IHLIDXUS)
        series_id = 'IHLIDXUS'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.job_postings_indeed = pd.DataFrame(data, columns=[series_id])

        # Load New Job Postings on Indeed in the United States (IHLIDXNEWUS)
        series_id = 'IHLIDXNEWUS'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.new_job_postings_indeed = pd.DataFrame(data, columns=[series_id])

        # Load 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity
        series_id = 'T10Y2Y'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.treasury_spread = pd.DataFrame(data, columns=[series_id])

        # Load Federal Funds Effective Rate
        series_id = 'DFF'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.federal_funds_rate = pd.DataFrame(data, columns=[series_id])

        # Load Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity, Quoted on an Investment Basis
        series_id = 'DGS10'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.treasury_yield_10y = pd.DataFrame(data, columns=[series_id])

        # Load Equity Market Volatility: Infectious Disease Tracker
        series_id = 'INFECTDISEMVTRACKD'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.equity_market_volatility = pd.DataFrame(data, columns=[series_id])

        # Load ICE BofA CCC & Lower US High Yield Index Effective Yield
        series_id = 'BAMLH0A3HYCEY'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.high_yield_index_yield = pd.DataFrame(data, columns=[series_id])

        # Load Bank Prime Loan Rate
        series_id = 'RIFSPBLPND'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.bank_prime_loan_rate = pd.DataFrame(data, columns=[series_id])


        # Load and prepare the Treasury Inflation Expectations data
        series_id = 'T10YIE'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.treasury_inflation_expectations = pd.DataFrame(data, columns=[series_id])
        self.treasury_inflation_expectations['ds'] = self.treasury_inflation_expectations.index
        self.treasury_inflation_expectations.rename(columns={series_id: 'treasury_inflation_expectations'}, inplace=True)

        # Load and prepare the 5-Year Inflation Expectations data
        series_id = 'T5YIFR'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.inflation_expectations_5_years = pd.DataFrame(data, columns=[series_id])

        # Load and prepare the Nominal Broad U.S. Dollar Index data
        series_id = 'DTWEXBGS'
        data = fred.get_series(series_id, '2020-02-01', end_date)
        self.nominal_broad_dollar_index = pd.DataFrame(data, columns=[series_id])

        # Load S&P 500 data
        sp500 = yf.Ticker('^GSPC')
        self.historical_data_sp500 = sp500.history(start='2020-02-01', end=end_date_yfinance)[['Close']]
        self.historical_data_sp500.index = pd.to_datetime(self.historical_data_sp500.index.strftime('%Y-%m-%d'))

        # Load Bitcoin data
        btc_usd = yf.Ticker('BTC-USD')
        self.historical_data_btc = btc_usd.history(start='2020-02-01', end=end_date_yfinance)[['Close']]
        self.historical_data_btc.index = pd.to_datetime(self.historical_data_btc.index.strftime('%Y-%m-%d'))

    def get_job_sector_data(self, sector):
        # Filter job postings for the specified sector
        data = self.job_postings[self.job_postings['display_name'] == sector][['date', 'indeed_job_postings_index']]
        data.rename(columns={'date': 'ds', 'indeed_job_postings_index': 'y'}, inplace=True)
        data['ds'] = pd.to_datetime(data['ds'].dt.strftime('%Y-%m-%d'))

        # Merge with S&P 500 data
        data = data.merge(self.historical_data_sp500, how='left', left_on='ds', right_index=True)
        data.rename(columns={'Close': 'sp500'}, inplace=True)

        # Merge with Bitcoin data
        data = data.merge(self.historical_data_btc, how='left', left_on='ds', right_index=True)
        data.rename(columns={'Close': 'btc_usd'}, inplace=True)

        # Merge with Treasury Inflation Expectations data
        data = data.merge(self.treasury_inflation_expectations, how='left', on='ds')

        # Merge with 5-Year Inflation Expectations data
        data = data.merge(self.inflation_expectations_5_years, how='left', left_on='ds', right_index=True)

        # Merge with Nominal Broad U.S. Dollar Index data
        data = data.merge(self.nominal_broad_dollar_index, how='left', left_on='ds', right_index=True)
        
        ###
        
        # Merge with Job Postings on Indeed in the United States (IHLIDXUS)
        data = data.merge(self.job_postings_indeed, how='left', left_on='ds', right_index=True)
        data.rename(columns={'Close': 'IHLIDXUS'}, inplace=True)
        # Merge with New Job Postings on Indeed in the United States (IHLIDXNEWUS)
        data = data.merge(self.new_job_postings_indeed, how='left', left_on='ds', right_index=True)

        # Merge with 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity
        data = data.merge(self.treasury_spread, how='left', left_on='ds', right_index=True)

        # Merge with Federal Funds Effective Rate
        data = data.merge(self.federal_funds_rate, how='left', left_on='ds', right_index=True)

        # Merge with Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity, Quoted on an Investment Basis
        data = data.merge(self.treasury_yield_10y, how='left', left_on='ds', right_index=True)

        # Merge with Equity Market Volatility: Infectious Disease Tracker
        data = data.merge(self.equity_market_volatility, how='left', left_on='ds', right_index=True)

        # Merge with ICE BofA CCC & Lower US High Yield Index Effective Yield
        data = data.merge(self.high_yield_index_yield, how='left', left_on='ds', right_index=True)

        # Merge with Bank Prime Loan Rate
        data = data.merge(self.bank_prime_loan_rate, how='left', left_on='ds', right_index=True)

        
        ###

        # Split data before filling missing values
        split_percentage = 0.8
        split_point = int(len(data) * split_percentage)
        training_data = data[:split_point]
        testing_data = data[split_point:]

        # Fill missing values in training data
        training_data.fillna(method='ffill', inplace=True)
        training_data.fillna(method='bfill', inplace=True)

        # Use the last value of the training data to fill initial NaNs in testing data
        for column in testing_data.columns:
            if pd.isnull(testing_data[column].iloc[0]):
                testing_data[column].iloc[0] = training_data[column].iloc[-1]

        # Then, forward fill the remaining NaNs in testing data
        testing_data.fillna(method='ffill', inplace=True)

        # Check for any NaN values in the training and testing data
        if training_data.isnull().any().any():
            raise ValueError("NaN values found in training data after processing.")
        if testing_data.isnull().any().any():
            raise ValueError("NaN values found in testing data after processing.")

        # Combine the data back
        data = pd.concat([training_data, testing_data])

        return data
    
    
    def perform_cross_validation(self, model, data):
        # Parameters for cross-validation
        cv_horizon = '90 days'  # Forecast horizon
        cv_initial = '180 days'  # Initial training period should capture at least one full seasonal cycle
        cv_period = '60 days'  # Spacing between cutoff dates for a good balance between granularity and computational efficiency

        # Perform cross-validation
        df_cv = cross_validation(model, initial=cv_initial, period=cv_period, horizon=cv_horizon)

        # Compute performance metrics
        df_p = performance_metrics(df_cv)

        # Return the aggregated RMSE (or another metric of your choice)
        return df_p['rmse'].mean()



    
    def optimize_hyperparameters(self, data):  
    
        def drop_highly_correlated(df, threshold):
            # Create correlation matrix
            corr_matrix = df.corr().abs()
            # Select upper triangle of correlation matrix
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
            # Find index of feature columns with correlation greater than the threshold
            to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
            # Drop features
            df = df.drop(df[to_drop], axis=1)
            return df
    
        best_mae_train = float('inf')
        best_mae_test = float('inf')
        best_scale = 0.05
        best_regressors = None
        best_model = None
        best_forecast = None
        split_percentage = 0.8
        best_rmse = float('inf')
        best_test_mae = 0 # THIS IS A TEST
        best_train_mae = 9999 #THIS IS A TEST
        best_params = {}


        regressors = ['sp500', 'btc_usd', 'treasury_inflation_expectations', 'T5YIFR', 'DTWEXBGS', 'IHLIDXUS', 'IHLIDXNEWUS', 'T10Y2Y', 'DFF', 'DGS10', 'INFECTDISEMVTRACKD', 'BAMLH0A3HYCEY', 'RIFSPBLPND']
        
        data_with_regressors = data[['ds'] + regressors]
        #print(data_with_regressors)
        data_with_regressors = drop_highly_correlated(data_with_regressors, threshold=0.6)
        filtered_regressors = list(data_with_regressors.columns)
        filtered_regressors.remove('ds')

        regressor_combinations = sum([list(itertools.combinations(filtered_regressors, i)) for i in range(1, len(filtered_regressors) + 1)], [])
        
        for combination in regressor_combinations:
            changepoint_prior_scale_value = 10
            previous_mae = float('inf')
            iterations = 0

            while changepoint_prior_scale_value >= 0.001 and iterations < 5:  # Increased the minimum value and reduced iterations
                # Initialize the model
                model = Prophet(changepoint_prior_scale=changepoint_prior_scale_value, n_changepoints=5)

                # Remove other seasonality components
                model.add_seasonality(name='weekly', period=7, fourier_order=3)
                model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

                #manually add customized seasonalities with limited Fourier terms
                model.add_seasonality(name='yearly', period=365.25, fourier_order=3)  # Fewer Fourier terms for yearly seasonality


                # Add regressors
                for regressor in combination:
                    model.add_regressor(regressor)

                # Split data
                split_point = int(len(data) * split_percentage)
                training_data = data[:split_point]
                testing_data = data[split_point:]

                # Fit the model
                model.fit(training_data)

                # Perform cross-validation and get the average RMSE
                avg_rmse = self.perform_cross_validation(model, data)

                # Create future dataframe
                future = model.make_future_dataframe(periods=len(testing_data))
                for regressor in combination:
                    future = future.merge(data[['ds', regressor]], on='ds', how='left')

                # Predict the future
                forecast = model.predict(future)

                # Evaluate the model
                y_true_train = training_data['y'].values
                y_pred_train = forecast.iloc[:split_point]['yhat'].values
                mae_train = mean_absolute_error(y_true_train, y_pred_train)

                y_true_test = testing_data['y'].values
                y_pred_test = forecast.iloc[split_point:]['yhat'].values
                mae_test = mean_absolute_error(y_true_test, y_pred_test)

                # Calculate MAPE for training and testing
                mape_train = np.mean(np.abs((y_true_train - y_pred_train) / y_true_train)) * 100
                mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100

                # Calculate RMSE for training and testing
                rmse_train = np.sqrt(mean_squared_error(y_true_train, y_pred_train))
                rmse_test = np.sqrt(mean_squared_error(y_true_test, y_pred_test))


                # Compare the average RMSE to find the best model
                if avg_rmse < best_rmse: 
                    best_test_mae = mae_test
                    best_train_mae = mae_train
                    best_rmse = avg_rmse
                    best_mae_train = mae_train
                    best_mae_test = mae_test
                    best_scale = changepoint_prior_scale_value
                    best_regressors = combination
                    best_model = model
                    best_forecast = forecast
                    best_mape_train = mape_train
                    best_mape_test = mape_test
                    best_rmse_train = rmse_train
                    best_rmse_test = rmse_test

                # Adaptive step size based on improvement
                if (previous_mae - mae_test) / previous_mae < 0.01:
                    changepoint_prior_scale_value *= 0.5
                    iterations += 1
                else:
                    previous_mae = mae_test

                changepoint_prior_scale_value -= changepoint_prior_scale_value * 0.5

        ###

        data_dict = {
            'sp500': self.historical_data_sp500,
            'btc_usd': self.historical_data_btc,
            'treasury_inflation_expectations': self.treasury_inflation_expectations,
            'T5YIFR': self.inflation_expectations_5_years,
            'DTWEXBGS': self.nominal_broad_dollar_index,
            'IHLIDXUS': self.job_postings_indeed,
            'IHLIDXNEWUS': self.new_job_postings_indeed,
            'T10Y2Y': self.treasury_spread,
            'DFF': self.federal_funds_rate,
            'DGS10': self.treasury_yield_10y,
            'INFECTDISEMVTRACKD': self.equity_market_volatility,
            'BAMLH0A3HYCEY': self.high_yield_index_yield,
            'RIFSPBLPND': self.bank_prime_loan_rate
        }
        
        # Initialize an empty dictionary to hold the forecasts
        forecasts_dict = {}
        merged_df = pd.DataFrame()

        for regressor, data in data_dict.items():
            # Ensure the data starts from 2020-02-01
            start_date = pd.Timestamp('2020-02-01')
            end_date = data.index.max()
            
            # Create a date range from 2020-02-01 to the end of the dataset
            full_date_range = pd.date_range(start=start_date, end=end_date, freq='D')

            data = data.reindex(full_date_range).fillna(method='ffill').fillna(method='bfill')

            print(data.head())
            # Split the data into training and test sets
            split_percentage = 0.8
            split_point = int(len(data) * split_percentage)

            
            train = data[:split_point]
            test = data[split_point:]

            # Reset the index to move 'Date' from the index to a column
            train = train.reset_index()
            test = test.reset_index()

            # Get the last column name (the one to be predicted)
            # Assuming df is your DataFrame
            if 'ds' in train.columns:
                train.reset_index(drop=True)
                train = train.set_index('ds')
                test.reset_index(drop=True)
                test = test.set_index('ds')

            train.columns = ['ds', 'y']
            test.columns = ['ds', 'y']
            # Rename the 'Date' column to 'ds' and the target column to 'y'
            #train.rename(columns={'Date': 'ds', target_column: 'y'}, inplace=True)
            #test.rename(columns={'Date': 'ds', target_column: 'y'}, inplace=True)
            
            # Ensure that 'ds' is of datetime type
            train['ds'] = pd.to_datetime(train['ds'])
            test['ds'] = pd.to_datetime(test['ds'])

            # Initialize and fit the Prophet model
            model = Prophet(changepoint_prior_scale=0.5)
            model.fit(pd.concat([train, test]))

            # Create a dataframe for future predictions
            future = model.make_future_dataframe(periods=90)

            # Make the predictions
            forecast = model.predict(future)

            # Store the forecast in the dictionary
            forecasts_dict[regressor] = forecast['yhat']
            # Add the regressor name as a new column in the forecast DataFrame
            forecast = forecast[['ds', 'yhat']]
            forecast.rename(columns={'yhat':regressor}, inplace=True)

            # If merged_df is empty, assign the forecast DataFrame to it
            #print(regressor)
            #print("---------------------FORECAST 2----------------------")
            #print(forecast)
            #print("---------------------FORECAST 2----------------------")
            if merged_df.empty:
                merged_df = forecast
            else:
                # Merge the current forecast DataFrame into the merged_df DataFrame on 'ds'
                merged_df = pd.merge(merged_df, forecast, on='ds', how='outer')

        merged_df['ds'] = pd.to_datetime(merged_df['ds'])




        ###
        
        # Prepare the future dataframe for prediction
        extended_future = best_model.make_future_dataframe(periods=len(testing_data) + 90)
        # Select the columns to merge based on the regressors list
        columns_to_merge = ['ds'] + regressors

        # Merge the selected columns from merged_df to extended_future
        extended_future = extended_future.merge(merged_df[columns_to_merge], on='ds', how='left')
        
        extended_future.rename(columns={'T10YIE': 'treasury_inflation_expectations'}, inplace=True)

        # Predict the future
        extended_forecast = best_model.predict(extended_future)
        #print("EXTENDED FORECAST")
        #print(extended_forecast)

        return {
                'avg_rmse': best_rmse,  # Include the best average RMSE here
                'scale': best_scale,
                'mae_train': best_mae_train,
                'mae_test': best_mae_test,
                'mape_train': best_mape_train,
                'mape_test': best_mape_test,
                'rmse_train': best_rmse_train,
                'rmse_test': best_rmse_test,
                'regressors': best_regressors,
                'model': best_model,
                'forecast': best_forecast,
                'extended_forecast': extended_forecast,
                'params': best_params,
            }

    def plot_job_postings_forecast(self, ax, sector):
        # Load additional data if not already loaded
        if self.historical_data_sp500 is None:
            self.load_additional_data(fred_api_key='3fda1d45198afb430e77220ef20d9de0')

        # Now you can safely use get_job_sector_data
        data = self.get_job_sector_data(sector)

        # Then, optimize the hyperparameters for the Prophet model based on the sector data
        best_scale, best_mae_train, best_mae_test, best_regressors, best_model, best_forecast = self.optimize_hyperparameters(data)

        # Plotting the observed values
        #ax.plot(data['ds'], data['y'], label='Observed', color='blue')

        # Plotting the forecasted values
        ax.plot(best_forecast['ds'], best_forecast['yhat'], label='Forecasted', color='orange')

        # Filling the area between the upper and lower confidence intervals
        ax.fill_between(best_forecast['ds'], best_forecast['yhat_lower'], best_forecast['yhat_upper'], color='gray', alpha=0.2)

        # Setting labels and title
        ax.set_xlabel('Date')
        ax.set_ylabel('Job Postings Index')
        ax.set_title(f'Forecasted Job Postings Index for {sector}')
        ax.legend()

        # Show the plot
        plt.show()
        

    def run_analysis_for_all_sectors(self, valid_sectors):
        self.best_parameters = {}
        self.load_additional_data(fred_api_key='3fda1d45198afb430e77220ef20d9de0')  # Ensure data is loaded
        for sector in valid_sectors:
            # Ensure data for each sector is processed
            if not os.path.exists(f'best_parameters_{sector}.joblib'):
                data = self.get_job_sector_data(sector)
                results = self.optimize_hyperparameters(data)
                self.best_parameters[sector] = results
                joblib.dump(results, f'best_parameters_{sector}.joblib')
            else:
                results = joblib.load(f'best_parameters_{sector}.joblib')

            data = self.get_job_sector_data(sector)
            self.best_parameters[sector] = {
                'avg_rmse': results['avg_rmse'],  # Include the best average RMSE here
                'scale': results['scale'],
                'mae_train': results['mae_train'],
                'mae_test': results['mae_test'],
                'mape_train': results['mape_train'],  # Ensure this key is correctly named
                'mape_test': results['mape_test'],
                'rmse_train': results['rmse_train'],
                'rmse_test': results['rmse_test'],
                'regressors': results['regressors'],
                'model': results['model'],
                'forecast': results['forecast'],
                'extended_forecast': results['extended_forecast'],
            }

            # Save the model and parameters to a file
            joblib.dump(self.best_parameters[sector], f'best_parameters_{sector}.joblib')
            print(f"The best average RMSE for {sector} is: {results['avg_rmse']}")




    def get_best_parameters_for_sector(self, sector):
        return self.best_parameters.get(sector, None)
    
    
    def compare_last_values(self, sector):
        best_parameters = self.get_best_parameters_for_sector(sector)
        if best_parameters is not None:
            # Get the last value in the test data
            last_test_value = best_parameters['forecast']['yhat'].iloc[-1]

            # Get the last value in the extended forecast
            last_forecast_value = best_parameters['extended_forecast']['yhat'].iloc[-1]

            # Calculate the percentage difference
            percentage_difference = ((last_forecast_value - last_test_value) / last_test_value) * 100

            # Compare the percentage difference with 2%
            if percentage_difference > 2:
                return "There will be a positive impact on job posting index"
            elif percentage_difference < -2:
                return "There will be a negative impact on job posting index"
            else:
                return "Job posting index will be stable"
        else:
            return "No data available for this sector"


In [None]:
import matplotlib
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score
import re
from prophet import Prophet
from bs4 import BeautifulSoup
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from fredapi import Fred
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tkinter as tk
import requests
from sklearn.preprocessing import StandardScaler, MinMaxScaler
matplotlib.use("TkAgg")
from sklearn.ensemble import VotingClassifier
from spacy.matcher import PhraseMatcher
import random
from sklearn.model_selection import TimeSeriesSplit
from tkinter import ttk
import pandas as pd
import joblib
from prophet.diagnostics import cross_validation, performance_metrics
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import spacy
from tensorflow.keras.utils import to_categorical
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
from tensorflow.keras.callbacks import EarlyStopping
from collections import defaultdict
import json
import itertools
import yfinance as yf
from tensorflow.keras.models import Sequential
import nltk
import os
from imblearn.over_sampling import SMOTE
import string
from collections import Counter


def create_skills_analysis_tab(parent):
    skill_analysis_tab = SkillAnalysisTab(parent)

# Function to create the Industry Analysis tab
def create_industry_analysis_tab(parent):
    # Create an instance of the MarketAnalysis class
    market_analysis = MarketAnalysis(api_key='3fda1d45198afb430e77220ef20d9de0')

    # Create a label for the latest category
    latest_category_label = ttk.Label(parent, text="")
    latest_category_label.pack(side=tk.TOP)

    # Create a frame for the content
    content_frame = ttk.Frame(parent)
    content_frame.pack(fill=tk.BOTH, expand=True)

    canvas = tk.Canvas(content_frame)
    canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    scrollbar = ttk.Scrollbar(content_frame, command=canvas.yview)
    scrollbar.pack(side=tk.RIGHT, fill='y')

    canvas.configure(yscrollcommand=scrollbar.set)
    canvas.bind('<Configure>', lambda e: canvas.configure(scrollregion=canvas.bbox("all")))

    frame = ttk.Frame(canvas)
    canvas.create_window((0, 0), window=frame, anchor='nw')

    # Neural Network Classification Report
    nn_label = tk.Label(frame, text="Neural Network Classification Report:")
    nn_label.pack()
    nn_text = tk.Text(frame, height=10, width=80)
    nn_text.pack()

    # Random Forest Classification Report
    rf_label = tk.Label(frame, text="\nRandom Forest Classification Report with Hyperparameter Tuning:")
    rf_label.pack()
    rf_text = tk.Text(frame, height=10, width=80)
    rf_text.pack()

    # Update the GUI with the results
    category_map = {0: "Declining", 1: "Slow Growth", 2: "Moderate Growth", 3: "Fast Growth"}
    latest_category_text = category_map[market_analysis.latest_category]
    latest_category_label.config(text=f"Latest Category: {latest_category_text}")
    nn_text.delete('1.0', tk.END)
    nn_text.insert(tk.END, market_analysis.nn_report)
    rf_text.delete('1.0', tk.END)
    rf_text.insert(tk.END, market_analysis.rf_report)



def create_job_posting_index_tab(parent):
    job_index_result_text = tk.StringVar()
    job_index_result_label = tk.Label(parent, textvariable=job_index_result_text, font=('Helvetica', 10))
    job_index_result_label.pack(pady=(5, 10))
    #valid_sectors = ["Administrative Assistance", "Accounting", "Beauty & Wellness", "Physicians & Surgeons", "Media & Communications"]
    valid_sectors = ["Administrative Assistance"]
    #valid_sectors = ["Administrative Assistance"]
    
    analysis = JobPostingsAnalysis()
    analysis.run_analysis_for_all_sectors(valid_sectors)

    def update_job_postings(sector):
        # Compare the last values and update the result text
        result_text = analysis.compare_last_values(sector)
        job_index_result_text.set(result_text)

        # Update the top graph based on the selected sector
        fig.clear()
        best_parameters = analysis.get_best_parameters_for_sector(sector)
        if best_parameters is not None:
            data = analysis.get_job_sector_data(sector)  # Get the data for the selected sector
            ax = fig.add_subplot(111)
            ax.plot(data['ds'], data['y'], label='Actual', color='blue')
            ax.plot(best_parameters['extended_forecast']['ds'], best_parameters['extended_forecast']['yhat'], label='Final Model', color='green')
            ax.set_xlabel('Date')
            ax.set_ylabel('Job Postings Index')
            ax.set_title(f'Job Postings Index for {sector}')
            ax.legend()
            fig_canvas.draw()

        # Update the bottom graph based on the selected sector
        fig2.clear()
        if best_parameters is not None:
            split_percentage = 0.7
            train_data = data[:int(len(data) * split_percentage)]
            test_data = data[int(len(data) * split_percentage):]
            ax2 = fig2.add_subplot(111)
            ax2.plot(train_data['ds'], train_data['y'], label='Training Data', color='blue')
            ax2.plot(best_parameters['forecast']['ds'][:len(train_data)], best_parameters['forecast']['yhat'][:len(train_data)], label='Best Model (Training)', color='orange')
            ax2.plot(test_data['ds'], test_data['y'], label='Testing Data', color='green')
            ax2.plot(best_parameters['forecast']['ds'][len(train_data):], best_parameters['forecast']['yhat'][len(train_data):], label='Best Model (Testing)', color='red')
            ax2.set_xlabel('Date')
            ax2.set_ylabel('Job Postings Index')
            ax2.set_title(f'Best Model Performance for {sector}')
            ax2.legend()
            canvas2.draw()            
            
            


    job_sector_var = tk.StringVar()
    job_sector_var.set('Select Sector')
    tk.Label(parent, text="Choose a job sector:").pack()
    popupMenu = tk.OptionMenu(parent, job_sector_var, *valid_sectors, command=update_job_postings)
    popupMenu.pack()

    fig = plt.Figure(figsize=(8, 6), dpi=100)
    fig_canvas = FigureCanvasTkAgg(fig, master=parent)  
    fig_canvas.draw()
    fig_canvas.get_tk_widget().pack()

    # Advanced section
    advanced_frame = tk.Frame(parent)  # Create a new frame for the advanced section

    def show_advanced_info():
        # Hide or show advanced information based on current state
        if advanced_frame.winfo_ismapped():
            advanced_frame.pack_forget()
            advanced_button.configure(text="Advanced ▲")
        else:
            sector = job_sector_var.get()
            best_parameters = analysis.get_best_parameters_for_sector(sector)
            if best_parameters is not None:
                try:
                    training_mape_label.config(text=f"Training MAPE: {best_parameters['mape_train']}%")  # Use 'mape_train' here
                    testing_mape_label.config(text=f"Testing MAPE: {best_parameters['mape_test']}%")
                    training_rmse_label.config(text=f"Training RMSE: {best_parameters['rmse_train']}")
                    testing_rmse_label.config(text=f"Testing RMSE: {best_parameters['rmse_test']}")
                    training_mae_label.config(text="Training MAE: " + str(best_parameters['mae_train']))
                    testing_mae_label.config(text="Testing MAE: " + str(best_parameters['mae_test']))
                    regressors_list.delete(0, tk.END)  # Clear the listbox
                    for regressor in best_parameters['regressors']:
                        regressors_list.insert(tk.END, regressor)  # Insert each regressor into the listbox
                except KeyError:
                    print(f"Key not found for sector: '{sector}'")
                    training_mape_label.config(text="")
                    testing_mape_label.config(text="")
                    training_rmse_label.config(text="")
                    testing_rmse_label.config(text="")
                    training_mae_label.config(text="")
                    testing_mae_label.config(text="")
                    regressors_list.delete(0, tk.END)  # Clear the listbox
            advanced_frame.pack()
            advanced_button.configure(text="Advanced ▼")

            print(f"Selected sector: '{sector}'")  # Debug print
            print(f"Available keys: {list(analysis.best_parameters.keys())}")  # Debug print

        # Update the scroll region after a small delay
        root.after(100, lambda: canvas.configure(scrollregion=canvas.bbox("all")))



    # Button to toggle advanced information
    advanced_button = tk.Button(parent, text="Advanced ▲", command=show_advanced_info)
    advanced_button.pack(pady=(10, 5))

    # Frame for advanced information
    advanced_frame = tk.Frame(parent)

    # Initially hide advanced frame
    advanced_frame.pack_forget()
    
    training_mape_label = tk.Label(parent, text="")
    training_mape_label.pack()

    testing_mape_label = tk.Label(parent, text="")
    testing_mape_label.pack()

    training_rmse_label = tk.Label(parent, text="")
    training_rmse_label.pack()

    testing_rmse_label = tk.Label(parent, text="")
    testing_rmse_label.pack()

    # Text labels for training and testing MAE
    training_mae_label = tk.Label(advanced_frame, text="Training MAE: 0.25")
    training_mae_label.pack()
    testing_mae_label = tk.Label(advanced_frame, text="Testing MAE: 0.30")
    testing_mae_label.pack()

    # List for the best regressors
    best_regressors_label = tk.Label(advanced_frame, text="Best Regressors:")
    best_regressors_label.pack()
    regressors_list = tk.Listbox(advanced_frame)
    regressors_list.insert(1, "Regressor 1")
    regressors_list.insert(2, "Regressor 2")
    regressors_list.insert(3, "Regressor 3")
    regressors_list.pack()

    # Another line graph
    fig2 = plt.Figure(figsize=(8, 6), dpi=100)
    plot2 = fig2.add_subplot(111)
    plot2.plot([1, 2, 3, 4], [50, 60, 70, 80])
    canvas2 = FigureCanvasTkAgg(fig2, master=advanced_frame)
    canvas2.draw()
    canvas2.get_tk_widget().pack()


    
def program_not_done(tab):
    label = ttk.Label(tab, text="Due to time constraints this program wasn't finished", font=("Arial", 20))
    label.pack(expand=True, padx=20, pady=20)




# Main Tkinter window setup
root = tk.Tk()
root.title("Market Analysis Dashboard")
root.geometry("1000x800")
root.resizable(width=0, height=0)

canvas = tk.Canvas(root)
canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

scrollbar = ttk.Scrollbar(root, command=canvas.yview)
scrollbar.pack(side=tk.RIGHT, fill='y')

canvas.configure(yscrollcommand=scrollbar.set)
canvas.bind('<Configure>', lambda e: canvas.configure(scrollregion=canvas.bbox("all")))

frame = ttk.Frame(canvas)
canvas.create_window((90, 0), window=frame, anchor='nw')

tab_control = ttk.Notebook(frame)
tab1 = ttk.Frame(tab_control)
tab2 = ttk.Frame(tab_control)
tab3 = ttk.Frame(tab_control)

# Assuming create_skills_analysis_tab and create_industry_analysis_tab are defined
create_skills_analysis_tab(tab1)
program_not_done(tab2)
create_job_posting_index_tab(tab3)

tab_control.add(tab1, text='Skills Analysis')
tab_control.add(tab2, text='Industry Analysis')
tab_control.add(tab3, text='Job Posting Index')

tab_control.pack(expand=1, fill='both')

root.mainloop()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data.fillna(method='ffill', inplace=True)
11:58:48 - cmdstanpy - INFO - Chain [1] start processing
11:58:48 - cmdstanpy - INFO - Chain [1] done processing
Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.


  0%|          | 0/16 [00:00<?, ?it/s]

11:58:48 - cmdstanpy - INFO - Chain [1] start processing
11:58:48 - cmdstanpy - INFO - Chain [1] done processing
11:58:49 - cmdstanpy - INFO - Chain [1] start processing
11:58:49 - cmdstanpy - INFO - Chain [1] done processing
11:58:49 - cmdstanpy - INFO - Chain [1] start processing
11:58:49 - cmdstanpy - INFO - Chain [1] done processing
11:58:49 - cmdstanpy - INFO - Chain [1] start processing
11:58:49 - cmdstanpy - INFO - Chain [1] done processing
11:58:50 - cmdstanpy - INFO - Chain [1] start processing
11:58:50 - cmdstanpy - INFO - Chain [1] done processing
11:58:50 - cmdstanpy - INFO - Chain [1] start processing
11:58:50 - cmdstanpy - INFO - Chain [1] done processing
11:58:51 - cmdstanpy - INFO - Chain [1] start processing
11:58:51 - cmdstanpy - INFO - Chain [1] done processing
11:58:51 - cmdstanpy - INFO - Chain [1] start processing
11:58:51 - cmdstanpy - INFO - Chain [1] done processing
11:58:51 - cmdstanpy - INFO - Chain [1] start processing
11:58:51 - cmdstanpy - INFO - Chain [1]

  0%|          | 0/16 [00:00<?, ?it/s]

11:58:56 - cmdstanpy - INFO - Chain [1] start processing
11:58:56 - cmdstanpy - INFO - Chain [1] done processing
11:58:57 - cmdstanpy - INFO - Chain [1] start processing
11:58:57 - cmdstanpy - INFO - Chain [1] done processing
11:58:57 - cmdstanpy - INFO - Chain [1] start processing
11:58:57 - cmdstanpy - INFO - Chain [1] done processing
11:58:57 - cmdstanpy - INFO - Chain [1] start processing
11:58:58 - cmdstanpy - INFO - Chain [1] done processing
11:58:58 - cmdstanpy - INFO - Chain [1] start processing
11:58:58 - cmdstanpy - INFO - Chain [1] done processing
11:58:58 - cmdstanpy - INFO - Chain [1] start processing
11:58:58 - cmdstanpy - INFO - Chain [1] done processing
11:58:59 - cmdstanpy - INFO - Chain [1] start processing
11:58:59 - cmdstanpy - INFO - Chain [1] done processing
11:58:59 - cmdstanpy - INFO - Chain [1] start processing
11:58:59 - cmdstanpy - INFO - Chain [1] done processing
11:59:00 - cmdstanpy - INFO - Chain [1] start processing
11:59:00 - cmdstanpy - INFO - Chain [1]

  0%|          | 0/16 [00:00<?, ?it/s]

11:59:05 - cmdstanpy - INFO - Chain [1] start processing
11:59:05 - cmdstanpy - INFO - Chain [1] done processing
11:59:05 - cmdstanpy - INFO - Chain [1] start processing
11:59:06 - cmdstanpy - INFO - Chain [1] done processing
11:59:06 - cmdstanpy - INFO - Chain [1] start processing
11:59:06 - cmdstanpy - INFO - Chain [1] done processing
11:59:06 - cmdstanpy - INFO - Chain [1] start processing
11:59:06 - cmdstanpy - INFO - Chain [1] done processing
11:59:07 - cmdstanpy - INFO - Chain [1] start processing
11:59:07 - cmdstanpy - INFO - Chain [1] done processing
11:59:07 - cmdstanpy - INFO - Chain [1] start processing
11:59:07 - cmdstanpy - INFO - Chain [1] done processing
11:59:08 - cmdstanpy - INFO - Chain [1] start processing
11:59:08 - cmdstanpy - INFO - Chain [1] done processing
11:59:08 - cmdstanpy - INFO - Chain [1] start processing
11:59:08 - cmdstanpy - INFO - Chain [1] done processing
11:59:08 - cmdstanpy - INFO - Chain [1] start processing
11:59:09 - cmdstanpy - INFO - Chain [1]

                  Close
2020-02-01  3248.919922
2020-02-02  3248.919922
2020-02-03  3248.919922
2020-02-04  3297.590088
2020-02-05  3334.689941


11:59:13 - cmdstanpy - INFO - Chain [1] start processing
11:59:14 - cmdstanpy - INFO - Chain [1] done processing


                  Close
2020-02-01  9392.875000
2020-02-02  9344.365234
2020-02-03  9293.521484
2020-02-04  9180.962891
2020-02-05  9613.423828


11:59:15 - cmdstanpy - INFO - Chain [1] start processing
11:59:16 - cmdstanpy - INFO - Chain [1] done processing


            treasury_inflation_expectations         ds
2020-02-01                             1.63 2020-02-03
2020-02-02                             1.63 2020-02-03
2020-02-03                             1.63 2020-02-03
2020-02-04                             1.64 2020-02-04
2020-02-05                             1.66 2020-02-05


11:59:17 - cmdstanpy - INFO - Chain [1] start processing
11:59:17 - cmdstanpy - INFO - Chain [1] done processing


            T5YIFR
2020-02-01    1.70
2020-02-02    1.70
2020-02-03    1.70
2020-02-04    1.70
2020-02-05    1.71


11:59:18 - cmdstanpy - INFO - Chain [1] start processing
11:59:19 - cmdstanpy - INFO - Chain [1] done processing


            DTWEXBGS
2020-02-01  116.1176
2020-02-02  116.1176
2020-02-03  116.1176
2020-02-04  115.9290
2020-02-05  116.0082


11:59:20 - cmdstanpy - INFO - Chain [1] start processing
11:59:21 - cmdstanpy - INFO - Chain [1] done processing


            IHLIDXUS
2020-02-01    100.00
2020-02-02     99.98
2020-02-03     99.97
2020-02-04    100.03
2020-02-05    100.12


11:59:22 - cmdstanpy - INFO - Chain [1] start processing
11:59:23 - cmdstanpy - INFO - Chain [1] done processing


            IHLIDXNEWUS
2020-02-01       100.00
2020-02-02       100.86
2020-02-03       101.71
2020-02-04       101.99
2020-02-05       102.21


11:59:24 - cmdstanpy - INFO - Chain [1] start processing
11:59:25 - cmdstanpy - INFO - Chain [1] done processing


            T10Y2Y
2020-02-01    0.18
2020-02-02    0.18
2020-02-03    0.18
2020-02-04    0.20
2020-02-05    0.22


11:59:26 - cmdstanpy - INFO - Chain [1] start processing
11:59:27 - cmdstanpy - INFO - Chain [1] done processing


             DFF
2020-02-01  1.59
2020-02-02  1.59
2020-02-03  1.59
2020-02-04  1.59
2020-02-05  1.59


11:59:28 - cmdstanpy - INFO - Chain [1] start processing
11:59:29 - cmdstanpy - INFO - Chain [1] done processing


            DGS10
2020-02-01   1.54
2020-02-02   1.54
2020-02-03   1.54
2020-02-04   1.61
2020-02-05   1.66


11:59:30 - cmdstanpy - INFO - Chain [1] start processing
11:59:31 - cmdstanpy - INFO - Chain [1] done processing


            INFECTDISEMVTRACKD
2020-02-01                8.36
2020-02-02                3.20
2020-02-03                8.18
2020-02-04                5.50
2020-02-05                2.71


11:59:32 - cmdstanpy - INFO - Chain [1] start processing
11:59:33 - cmdstanpy - INFO - Chain [1] done processing


            BAMLH0A3HYCEY
2020-02-01          11.84
2020-02-02          11.84
2020-02-03          11.84
2020-02-04          11.66
2020-02-05          11.57


11:59:33 - cmdstanpy - INFO - Chain [1] start processing
11:59:34 - cmdstanpy - INFO - Chain [1] done processing


            RIFSPBLPND
2020-02-01        4.75
2020-02-02        4.75
2020-02-03        4.75
2020-02-04        4.75
2020-02-05        4.75


11:59:35 - cmdstanpy - INFO - Chain [1] start processing
11:59:36 - cmdstanpy - INFO - Chain [1] done processing
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data.fillna(method='ffill', inplace=True)


The best average RMSE for Administrative Assistance is: 62.385545396940024


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_data.fillna(method='ffill', inplace=True)


Selected sector: 'Administrative Assistance'
Available keys: ['Administrative Assistance']
