In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, \
    mean_absolute_percentage_error, accuracy_score, precision_score, \
        recall_score, f1_score, precision_recall_curve
from sklearn import preprocessing
import duckdb

In [None]:
# Raw head-to-head visit counts, one row per contest.
data = pd.read_csv('../campaign_csvs/people_visited_comparison.csv')

# The query reads the DataFrame bound to the Python name `data`, so the raw
# frame has to carry that name when the query is built. Rows are kept only
# when both sides have visits and the gap between them exceeds 250000.
large_gap_query = """SELECT *, visits_A - visits_B AS diff FROM data WHERE visits_A > 0 AND visits_B > 0
                  AND ABS(diff) > 250000"""
large_gap_relation = duckdb.sql(large_gap_query)
data = large_gap_relation.df()

# Outcome flag plotted against the visit gap.
sns.scatterplot(data=data, x='diff', y='A_won')

In [None]:
def Normalizer(df_cols, scaler=None):
    """Standardise feature columns with a StandardScaler.

    Parameters
    ----------
    df_cols : array-like of shape (n_samples, n_features)
        Values to scale. Callers in this notebook pass a single column
        reshaped to (-1, 1).
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler to apply. Pass the scaler fitted on the
        training features when transforming data for prediction, so both
        are expressed in the same units. When omitted, a new
        ``StandardScaler`` is fitted on ``df_cols`` itself (the original
        behaviour).

    Returns
    -------
    The output of ``scaler.transform(df_cols)``.
    """
    if scaler is None:
        scaler = preprocessing.StandardScaler().fit(df_cols)
    return scaler.transform(df_cols)

In [None]:
# Fit a one-feature logistic regression: standardised visit gap -> A_won.
# The target is passed as a 1-D array; handing fit() an (n, 1) column vector
# makes scikit-learn emit a DataConversionWarning before flattening it itself.
model = LogisticRegression().fit(
    Normalizer(data['diff'].values.reshape(-1, 1)),
    data['A_won'].values,
)
# print(model.coef_)
# print(model.intercept_)

# print(model.predict_proba([[10]]))

In [None]:
# Score the training rows with the fitted model.
scaled_diff = Normalizer(data['diff'].values.reshape(-1,1))
class_probabilities = model.predict_proba(scaled_diff)

# Column 1 of predict_proba is the probability of the class listed second in
# model.classes_ (presumably A_won == 1 -- confirm against the data).
data['predictions'] = class_probabilities[:, 1]

sns.scatterplot(data=data, x='diff', y='predictions', hue='A_won')

In [None]:

prediction_data = pd.read_csv('../campaign_csvs/election_prediction.csv')

# `model` was trained on the standardised `data['diff']`. Inference inputs must
# be standardised with that same mean/scale; fitting a new scaler on each
# election's own gaps would feed the model values in different units than the
# ones its coefficients were learned in.
train_diff_scaler = preprocessing.StandardScaler().fit(data['diff'].values.reshape(-1, 1))


def _expected_evotes(election, diff_col, evotes_col):
    """Split each state's electoral votes by the model's win probability.

    Adds ``evotes_col`` to ``election`` in place: ``evotes`` multiplied by the
    predicted probability (column 1 of ``model.predict_proba``) for the
    candidate whose count comes first in ``diff_col``.

    Parameters
    ----------
    election : pandas.DataFrame
        Must contain ``evotes`` and ``diff_col``.
    diff_col : str
        Column holding first-candidate minus second-candidate counts.
    evotes_col : str
        Name of the column to create with the expected electoral votes.

    Returns
    -------
    tuple
        ``(first_candidate_votes, second_candidate_votes)``; the second is the
        remainder of the total ``evotes``.
    """
    scaled_diff = train_diff_scaler.transform(election[diff_col].values.reshape(-1, 1))
    election[evotes_col] = election['evotes'] * model.predict_proba(scaled_diff)[:, 1]
    first_votes = np.sum(election[evotes_col])
    second_votes = np.sum(election['evotes']) - first_votes
    return first_votes, second_votes


# The queries stay at cell level so duckdb can see the `prediction_data` frame.
print('2004:')
election_2004 = duckdb.sql("""SELECT state, evotes, Bush_2004, Kerry_2004, Bush_2004-Kerry_2004 AS diff_2004 FROM prediction_data
                           WHERE Bush_2004 > 0 AND Kerry_2004 > 0 AND NOT(state = 'District of Columbia')
                           AND NOT(state = 'New York')""").df()

bush_votes, kerry_votes = _expected_evotes(election_2004, 'diff_2004', 'evotes_2004')
print(f'Bush: {bush_votes}')
print(f'Kerry: {kerry_votes}')

print('2008:')
election_2008 = duckdb.sql("""SELECT state, evotes, Obama_2008, Mccain_2008, Obama_2008-Mccain_2008 AS diff_2008 FROM prediction_data
                           WHERE Obama_2008 > 0 AND Mccain_2008 > 0""").df()

obama_votes, mccain_votes = _expected_evotes(election_2008, 'diff_2008', 'evotes_2008')
print(f'Obama: {obama_votes}')
print(f'McCain: {mccain_votes}')

print('2012:')
election_2012 = duckdb.sql("""SELECT state, evotes, Obama_2012, Romney_2012, Obama_2012-Romney_2012 AS diff_2012 FROM prediction_data
                           WHERE Obama_2012 > 0 AND Romney_2012 > 0""").df()

obama_votes, romney_votes = _expected_evotes(election_2012, 'diff_2012', 'evotes_2012')
print(f'Obama: {obama_votes}')
print(f'Romney: {romney_votes}')

print('2016:')
election_2016 = duckdb.sql("""SELECT state, evotes, Trump_2016, Clinton_2016, Trump_2016-Clinton_2016 AS diff_2016 FROM prediction_data
                           WHERE Trump_2016 > 0 AND Clinton_2016 > 0""").df()

trump_votes, clinton_votes = _expected_evotes(election_2016, 'diff_2016', 'evotes_2016')
print(f'Trump: {trump_votes}')
print(f'Clinton: {clinton_votes}')

print('2020:')
election_2020 = duckdb.sql("""SELECT state, evotes, Trump_2020, Biden_2020, Biden_2020-Trump_2020 AS diff_2020 FROM prediction_data
                           WHERE Trump_2020 > 0 AND Biden_2020 > 0""").df()

biden_votes, trump_votes = _expected_evotes(election_2020, 'diff_2020', 'evotes_2020')
print(f'Biden: {biden_votes}')
print(f'Trump: {trump_votes}')

print('2024:')
election_2024 = duckdb.sql("""SELECT state, evotes, Trump_2024, Harris_2024, Harris_2024-Trump_2024 AS diff_2024 FROM prediction_data
                           WHERE Trump_2024 > 0 AND Harris_2024 > 0""").df()

harris_votes, trump_votes = _expected_evotes(election_2024, 'diff_2024', 'evotes_2024')
print(f'Harris: {harris_votes}')
print(f'Trump: {trump_votes}')

In [None]:
data_binary = pd.read_csv('../campaign_csvs/people_visited_comparison.csv')

data_binary = duckdb.sql("""SELECT *, visits_A - visits_B AS diff FROM data_binary WHERE visits_A > 0 AND visits_B > 0""").df()

# Sweep the minimum |diff| kept for training and record in-sample accuracy.
# Each pass filters the full frame into `subset`, so `data_binary` is never
# mutated (no column assignment onto a slice) and the builtin `filter` is not
# shadowed by a list.
sweep_rows = []
for threshold in range(0, 500000, 10000):
    subset = data_binary[data_binary['diff'].abs() > threshold]

    # LogisticRegression.fit raises when only one outcome class is left; stop
    # the sweep there and keep the rows collected so far.
    if subset['A_won'].nunique() < 2:
        print(f'Stopping sweep at filter {threshold}: fewer than two outcome classes remain')
        break

    features = Normalizer(subset['diff'].values.reshape(-1, 1))
    labels = subset['A_won'].values  # 1-D target avoids DataConversionWarning

    model_binary = LogisticRegression().fit(features, labels)
    predicted = model_binary.predict(features)

    sweep_rows.append({
        'filter': threshold,
        # accuracy_score expects (y_true, y_pred)
        'accuracy': round(accuracy_score(labels, predicted), 2),
        'count': len(subset),
    })

df = pd.DataFrame(sweep_rows, columns=['filter', 'accuracy', 'count'])
print(df)
sns.lineplot(df, x='filter',y='accuracy')

In [None]:
prediction_data = pd.read_csv('../campaign_csvs/election_prediction.csv')

election_2012 = duckdb.sql("""SELECT state, evotes, Obama_2012, Romney_2012, Obama_2012-Romney_2012 AS diff_2012 FROM prediction_data
                           WHERE Obama_2012 > 0 AND Romney_2012 > 0""").df()

data_binary = pd.read_csv('../campaign_csvs/people_visited_comparison.csv')

data_binary = duckdb.sql("""SELECT *, visits_A - visits_B AS diff FROM data_binary WHERE visits_A > 0 AND visits_B > 0""").df()

# Loop-invariant inputs, computed once instead of on every iteration.
election_diff = election_2012['diff_2012'].values.reshape(-1, 1)
total_evotes = np.sum(election_2012['evotes'])

# Sweep the minimum |diff| kept for training and record the 2012 electoral-vote
# split each resulting model predicts. Each pass filters the full frame into
# `subset`, so `data_binary` is never mutated and the builtin `filter` is not
# shadowed by a list.
sweep_rows = []
for threshold in range(50000, 300000, 100):
    subset = data_binary[data_binary['diff'].abs() > threshold]

    # LogisticRegression.fit raises when only one outcome class is left; stop
    # the sweep there and keep the rows collected so far.
    if subset['A_won'].nunique() < 2:
        print(f'Stopping sweep at filter {threshold}: fewer than two outcome classes remain')
        break

    # Fit the scaler on the training gaps and reuse it for the election gaps,
    # so the model sees prediction inputs in the units it was trained in.
    train_diff = subset['diff'].values.reshape(-1, 1)
    threshold_scaler = preprocessing.StandardScaler().fit(train_diff)
    model_binary = LogisticRegression().fit(
        threshold_scaler.transform(train_diff),
        subset['A_won'].values,  # 1-D target avoids DataConversionWarning
    )

    obama_win_prob = model_binary.predict_proba(threshold_scaler.transform(election_diff))[:, 1]
    election_2012['evotes_2012'] = election_2012['evotes'] * obama_win_prob
    obama_votes = np.sum(election_2012['evotes_2012'])
    romney_votes = total_evotes - obama_votes

    sweep_rows.append({'filter': threshold, 'romney': romney_votes, 'obama': obama_votes})

df = pd.DataFrame(sweep_rows, columns=['filter', 'romney', 'obama'])
dfl = pd.melt(df, ['filter'])
print(df)
sns.lineplot(dfl, x='filter',y='value',hue='variable')