In [141]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import pandas as pd
from tqdm.notebook import tqdm_notebook
from datasets import Dataset
import copy


def normalize(num):
    """Return ``num`` shifted up by one.

    Works for anything that supports ``+ 1`` (plain numbers, NumPy arrays,
    pandas Series, ...).
    """
    shifted = num + 1
    return shifted
# Canonical club name -> list of spellings/abbreviations accepted for that club.
# find_team_number() compares names case-insensitively against these lists and
# returns the *position* of the matching entry, so inserting, removing or
# reordering entries changes the numeric ids it hands out.
teams_dict = {
    'Arsenal': ['Arsenal', 'ARS'],
    'Aston Villa': ['Aston Villa', 'AVL', 'AstonVilla', 'Villa'],
    'Barnsley': ['Barnsley', 'BAR'],
    'Birmingham City': ['Birmingham City', 'Birmingham', 'BIR'],
    'Blackburn Rovers': ['Blackburn Rovers', 'Blackburn', 'BLA'],
    'Blackpool': ['Blackpool', 'BLP'],
    'Bolton Wanderers': ['Bolton Wanderers', 'Bolton', 'BOL'],
    'Bournemouth': ['Bournemouth', 'BOU'],
    'Bradford City': ['Bradford City', 'Bradford', 'BFD'],
    'Brentford': ['Brentford', 'BRE'],
    'Brighton': ['Brighton & Hove Albion', 'Brighton', 'BHA', 'Brighton and Hove Albion'],
    'Burnley': ['Burnley', 'BUR'],
    'Cardiff City': ['Cardiff City', 'Cardiff', 'CAR'],
    'Charlton Athletic': ['Charlton Athletic', 'Charlton', 'CHA'],
    'Chelsea': ['Chelsea', 'CHE'],
    'Coventry City': ['Coventry City', 'Coventry', 'COV'],
    'Crystal Palace': ['Crystal Palace', 'CRY', 'CrystalPalace'],
    'Derby County': ['Derby County', 'Derby', 'DER'],
    'Everton': ['Everton', 'EVE'],
    'Fulham': ['Fulham', 'FUL'],
    'Huddersfield Town': ['Huddersfield Town', 'Huddersfield', 'HUD'],
    'Hull City': ['Hull City', 'Hull', 'HUL'],
    'Ipswich Town': ['Ipswich Town', 'Ipswich', 'IPS'],
    'Leeds United': ['Leeds United', 'Leeds', 'LEE'],
    'Leicester City': ['Leicester City', 'Leicester', 'LEI'],
    'Liverpool': ['Liverpool', 'LIV'],
    'Manchester City': ['Manchester City', 'Man City', 'MCI', 'ManCity'],
    'Manchester United': ['Manchester United', 'Man Utd', 'ManUnited', 'Man United', 'MUN'],
    'Middlesbrough': ['Middlesbrough', 'MID'],
    'Newcastle United': ['Newcastle United', 'Newcastle', 'NEW'],
    'Norwich City': ['Norwich City', 'Norwich', 'NOR', 'NorwichCity'],
    'Nottingham Forest': ['Nottingham Forest', 'Nottm Forest', 'NTF'],
    'Oldham Athletic': ['Oldham Athletic', 'Oldham', 'OLD'],
    'Portsmouth': ['Portsmouth', 'POR'],
    'Queens Park Rangers': ['Queens Park Rangers', 'QPR'],
    'Reading': ['Reading', 'RDG'],
    'Sheffield United': ['Sheffield United', 'Sheffield Utd', 'SU', 'SheffieldUnited'],
    'Sheffield Wednesday': ['Sheffield Wednesday', 'Sheffield Wed', 'SW', 'SheffieldWednesday'],
    'Southampton': ['Southampton', 'SOU'],
    'Stoke City': ['Stoke City', 'Stoke', 'STK'],
    'Sunderland': ['Sunderland', 'SUN'],
    'Swansea City': ['Swansea City', 'Swansea'],
    'Swindon Town': ['Swindon Town', 'SWI'],
    'Tottenham Hotspur': ['Tottenham Hotspur', 'Tottenham', 'TOT', 'Spurs'],
    'Watford': ['Watford', 'WAT'],
    'West Bromwich Albion': ['West Bromwich Albion', 'West Brom','WestBrom','WBA'],
    'West Ham United': ['West Ham United', 'West Ham', 'WHU', 'WestHam'],
    'Wigan Athletic': ['Wigan Athletic','WiganAthletic' ,'Wigan', 'WIG'],
    'Wimbledon': ['Wimbledon', 'WIM'],
    'Wolverhampton Wanderers': ['Wolverhampton Wanderers', 'Wolves', 'WOL']
}

In [142]:
# Tokenizer comes from the "distilbert-base-uncased" checkpoint; the
# classification weights are loaded from the local ./fine-tuned directory.
# Both names are module-level globals read by sentiment_analysis().
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
fine_tuned_model = DistilBertForSequenceClassification.from_pretrained('./fine-tuned')
def sentiment_analysis(text_list):
    """Label every text as 'loss', 'draw' or 'win' using the fine-tuned model.

    Each text is tokenized and classified on its own (one forward pass per
    text) with the module-level ``tokenizer`` and ``fine_tuned_model``; a
    notebook progress bar tracks the loop.

    Args:
        text_list: iterable of strings to classify.

    Returns:
        list[str]: one label per input text, in input order. Predicted class
        index 0 maps to 'loss', 1 to 'draw', and any other index to 'win'.
    """
    label_by_class = {0: 'loss', 1: 'draw'}
    results = []

    # Pure inference: disabling autograd avoids building a computation graph
    # for every forward pass, which only costs memory and time here.
    with torch.no_grad():
        for text in tqdm_notebook(text_list):
            # Tokenize the input text (truncated to the model's maximum length)
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

            # Get the model's output
            outputs = fine_tuned_model(**inputs)

            # Index of the highest logit is the predicted class
            _, predicted_class = torch.max(outputs.logits, dim=1)

            # Convert the predicted class index to a sentiment label
            results.append(label_by_class.get(predicted_class.item(), 'win'))

    return results

def addAway(df):
    """Append a mirrored copy of every row, seen from the other team's side.

    In the mirrored rows the values of the ``home`` and ``away`` columns are
    exchanged and every occurrence of the tokens ``home`` / ``away`` inside
    ``tweet`` is swapped. The input frame is not modified.

    Args:
        df: DataFrame with at least the columns ``tweet``, ``home`` and
            ``away``. Column order and additional columns do not matter.

    Returns:
        DataFrame with ``2 * len(df)`` rows and a fresh 0..n-1 index: the
        original rows followed by their mirrored counterparts, with columns
        in the same order as ``df``.
    """
    def _swap_token(match):
        # Exchange the two tokens in one pass so no placeholder is needed.
        return 'away' if match.group(0) == 'home' else 'home'

    # Relabel by name rather than by position so the swap stays correct for
    # any column order or extra columns; concat below realigns by name.
    df_swapped = df.rename(columns={'home': 'away', 'away': 'home'})
    df_swapped['tweet'] = df_swapped['tweet'].str.replace(r'home|away', _swap_token, regex=True)
    combined_df = pd.concat([df, df_swapped], ignore_index=True)
    return combined_df




In [143]:
# Read each season's tweet CSV, drop its 'Unnamed: 0' column and let addAway
# append the mirrored (home/away swapped) rows.
df_21, df_20, df_19, df_18 = (
    addAway(pd.read_csv(csv_name).drop('Unnamed: 0', axis=1))
    for csv_name in ("21tweets.csv", "20tweets.csv", "19tweets.csv", "18tweets.csv")
)

In [147]:
# Classify every tweet of each season and store the labels in a new
# 'sentiment' column of that season's frame. One forward pass per tweet, so
# this cell is slow on the full data.
for season_frame in (df_21, df_20, df_19, df_18):
    results = sentiment_analysis(list(season_frame['tweet']))
    season_frame['sentiment'] = results

  0%|          | 0/35250 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Independent copies: later cells overwrite 'sentiment' on these, leaving the
# frames that hold the classifier's string labels untouched.
df21, df20, df19, df18 = (
    copy.deepcopy(labelled_frame)
    for labelled_frame in (df_21, df_20, df_19, df_18)
)

In [148]:
# Bare expression: the notebook renders df21 as this cell's output.
df21

Unnamed: 0,tweet,date,home,away,sentiment
0,home 2 away 1 a scorer toney,2021-08-13 18:59:56+00:00,brentford,arsenal,1
1,home 2 0 away toney double,2021-08-13 18:59:55+00:00,brentford,arsenal,1
2,they know they are winnning full time home 6 1...,2021-08-13 18:59:49+00:00,brentford,arsenal,1
3,home 0 vs 1 away ft,2021-08-13 18:59:10+00:00,brentford,arsenal,-1
4,sorry away fans i m predicting 3 0 to home,2021-08-13 18:59:07+00:00,brentford,arsenal,1
5,prediction home 1 2 away balogun first scorer,2021-08-13 18:58:56+00:00,brentford,arsenal,-1
6,home 4 0 away trust,2021-08-13 18:58:39+00:00,brentford,arsenal,1
7,3 1 home vs away,2021-08-13 18:58:36+00:00,brentford,arsenal,1
8,full time home 6 1 away,2021-08-13 18:58:20+00:00,brentford,arsenal,1
9,las minute prediction home 3 away 1,2021-08-13 18:58:10+00:00,brentford,arsenal,1


In [78]:
def numerical(sentiment):
    """Translate a sentiment label into its numeric score.

    'win' -> 1, 'draw' -> 0, 'loss' -> -1. Any other value yields ``None``.
    """
    score_table = (('win', 1), ('draw', 0), ('loss', -1))
    for label, score in score_table:
        if sentiment == label:
            return score
    return None
        
def find_team_number(team_name_):
    """Return the zero-based position of the teams_dict entry matching a name.

    The name is compared case-insensitively against every alias of every
    entry, in dictionary order; the first entry containing it wins.
    Returns -1 when no entry matches.
    """
    matching_positions = (
        position
        for position, aliases in enumerate(teams_dict.values())
        if team_name_.lower() in {alias.lower() for alias in aliases}
    )
    return next(matching_positions, -1)

In [68]:
# Replace the string labels with their numeric scores (in place, per season).
# NOTE: re-running this cell feeds the already-numeric column through
# numerical() again, which returns None for anything that is not a label.
for season_frame in (df21, df20, df19, df18):
    season_frame['sentiment'] = season_frame['sentiment'].apply(numerical)

# Mean score per (home, away) pair for each season.
df_21_sentiments, df_20_sentiments, df_19_sentiments, df_18_sentiments = (
    scored_frame.groupby(['home', 'away']).agg({'sentiment': 'mean'}).reset_index()
    for scored_frame in (df21, df20, df19, df18)
)


In [113]:
def addSentiments(df,df_sentiments):
    """Attach the per-fixture mean sentiment to a fixtures frame.

    Team names on both sides are converted to numeric ids with
    ``find_team_number`` and fixtures are matched on the
    ``(home_num, away_num)`` pair.

    Args:
        df: fixtures DataFrame with 'Home Team' and 'Away Team' columns.
            Modified in place: gains 'home_num', 'away_num' and 'sentiments'.
        df_sentiments: DataFrame with 'home', 'away' and 'sentiment' columns.
            Modified in place: gains 'home_num' and 'away_num'.

    Returns:
        DataFrame with the columns 'Home Team', 'Away Team', 'home_num',
        'away_num' and 'sentiments'. Fixtures without a matching sentiment
        row get 0.0; if several sentiment rows map to the same id pair their
        unweighted mean is used.
    """
    df['home_num'] = df['Home Team'].apply(find_team_number)
    df['away_num'] = df['Away Team'].apply(find_team_number)
    df_sentiments['home_num'] = df_sentiments['home'].apply(find_team_number)
    df_sentiments['away_num'] = df_sentiments['away'].apply(find_team_number)

    # One lookup table instead of a boolean scan of df_sentiments per fixture.
    # Grouping also collapses duplicate id pairs (different spellings of the
    # same club) into a single value instead of an ambiguous multi-row match.
    # NOTE(review): names unknown to find_team_number all map to -1, so two
    # different unknown clubs are treated as the same team here - confirm
    # whether such rows should be excluded.
    sentiment_by_fixture = (
        df_sentiments.groupby(['home_num', 'away_num'])['sentiment'].mean().to_dict()
    )

    # Explicit "missing" default: a genuine 0.0 score is a value, not absence.
    df['sentiments'] = [
        float(sentiment_by_fixture.get((home_num, away_num), 0.0))
        for home_num, away_num in zip(df['home_num'], df['away_num'])
    ]
    # Return the column that was just written ('sentiments', plural).
    return df[['Home Team','Away Team','home_num','away_num','sentiments']]

In [117]:
# Reload each season's FBref file, keeping only the team columns, their
# numeric ids and the sentiment score.
df18_new, df19_new, df20_new, df21_new = (
    pd.read_csv(season_csv)[['Home Team','Away Team','home_num','away_num','sentiments']]
    for season_csv in (
        "/Users/moksh/PL Predictions/FbrefDfs/FBref2018Season.csv",
        "/Users/moksh/PL Predictions/FbrefDfs/FBref2019Season.csv",
        "/Users/moksh/PL Predictions/FbrefDfs/FBref2020Season.csv",
        "/Users/moksh/PL Predictions/FbrefDfs/FBref2021Season.csv",
    )
)


In [119]:
# Write each season's trimmed sentiment table to its own CSV.
for season_table, out_path in (
    (df18_new, "/Users/moksh/PL Predictions/Twitter/sentiments/2018.csv"),
    (df19_new, "/Users/moksh/PL Predictions/Twitter/sentiments/2019.csv"),
    (df20_new, "/Users/moksh/PL Predictions/Twitter/sentiments/2020.csv"),
    (df21_new, "/Users/moksh/PL Predictions/Twitter/sentiments/2021.csv"),
):
    season_table.to_csv(out_path)





In [107]:
# Merge the 2018 per-fixture sentiment into the 2018 FBref fixtures.
fbref_18 = pd.read_csv("/Users/moksh/PL Predictions/FbrefDfs/FBref2018Season.csv")
df_fbref_18 = addSentiments(fbref_18, df_18_sentiments)
# NOTE(review): this writes to the same path that was read above, replacing
# the file with whatever addSentiments returned - confirm that overwriting the
# source file is intended.
df_fbref_18.to_csv("/Users/moksh/PL Predictions/FbrefDfs/FBref2018Season.csv")



NameError: name 'df_17_sentiments' is not defined