# Exercises Hand-In 3 e2
## Group 30 (Oliver Nilsson)

In [6]:
# Import required libraries
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import torch
from torch import cuda
import matplotlib.pyplot as plt
import nltk
from nltk.stem.snowball import SnowballStemmer
import re

# Sanity check: collect one line per library and print them together, so a
# missing or broken installation fails right here at the top of the notebook.
# NOTE(review): re.__version__ appears to be the regex module's own version
# string rather than the Python version - confirm this is what should be shown.
version_report = [
    f"Pandas version: {pd.__version__}",
    f"Sklearn version: {sklearn.__version__}",
    f"NLTK version: {nltk.__version__}",
    f"Re version: {re.__version__}",
    f"Torch version: {torch.__version__}",
    f"Matplotlib version: {plt.matplotlib.__version__}",
]
print("\n".join(version_report))

Pandas version: 2.2.2
Sklearn version: 1.4.2
NLTK version: 3.8.1
Re version: 2.2.1
Torch version: 2.3.0+cu121
Matplotlib version: 3.9.0


In [None]:
# Test if GPU is available.
# torch.cuda.get_device_name(0) raises when no CUDA device is present, so check
# availability first and fall back to a descriptive message; this lets the
# notebook run top to bottom on CPU-only machines as well.
gpu_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "No CUDA device available - running on CPU"
)
gpu_name

In [2]:
# Load the tweets from the csv file and clean them in a single method chain
df_tweets = (
    pd.read_csv('Users/gusnilolaa/data/1377884570_tweet_global_warming.csv', encoding='ISO-8859-1', engine='python')
    .dropna()  # discard rows with missing values
    .assign(
        # Encode the label as an integer: Yes/Y -> 1 and No/N -> 0
        existence=lambda frame: frame['existence'].map({'Y': 1, 'Yes': 1, 'N': 0, 'No': 0}).astype(int),
        # Strip the literal "[link]" placeholder from the tweet text
        tweet=lambda frame: frame['tweet'].replace('\\[link\\]', '', regex=True),
    )
)

# Show the first 5 rows of the cleaned dataframe
df_tweets.head()

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,1,1.0
1,Fighting poverty and global warming in Africa,1,1.0
2,Carbon offsets: How a Vatican forest failed to...,1,0.8786
3,Carbon offsets: How a Vatican forest failed to...,1,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,1,0.8087


In [3]:
# Hold out 30 % of the observations as a test set; the remaining 70 % form the
# training set. The fixed random_state seeds the shuffle so the split is reproducible.
split_options = dict(test_size=0.3, random_state=42)
train_df, test_df = train_test_split(df_tweets, **split_options)

train_df.head()

Unnamed: 0,tweet,existence,existence.confidence
230,Ocean Saltiness Shows Global Warming Is Intens...,1,1.0
498,RT @panteraonca07: Slideshow of Alaska Before ...,1,1.0
2510,"@prismsinc Worlds Greenest Celebrity: Limos, P...",1,0.6499
5115,FRIDAY AFTERNOON IGNORANCE-OFF: Virginia GOP (...,1,0.6717
3370,RT @mmfa: Brain Freeze: Conservative media sti...,1,0.6969


## 2. Fine-tune a BERT model to predict non-climate sceptic language using the Augmented Social Scientist package

### a)

In [None]:
# Define required functions
def alpha_only(text):
    """Keep only ASCII letters and whitespace in ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character other than a-z, A-Z and whitespace removed.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence, which newer Python versions report as a warning.
    return re.sub(r"[^a-zA-Z\s]", "", text)

# English Snowball stemmer, built once and shared by every stem_tokens call
stemmer = SnowballStemmer("english")

def stem_tokens(tokens):
    """Return a list with the Snowball stem of each token, in the original order."""
    return list(map(stemmer.stem, tokens))

def tokenize(text):
    """Split ``text`` on whitespace and return the stemmed tokens as a list."""
    return stem_tokens(text.split())

# Create a TfidfVectorizer object that cleans each tweet with alpha_only and
# splits/stems it with tokenize.
# NOTE(review): stop words are filtered after tokenize() has stemmed the tokens,
# so stop words whose stem differs from the word itself may not be removed -
# confirm whether this is intended.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=alpha_only,
    tokenizer=tokenize,
    # token_pattern is unused once a custom tokenizer is supplied; passing None
    # makes that explicit and avoids scikit-learn's unused-parameter warning.
    token_pattern=None,
    stop_words='english',
)

# Convert the training and test set to a matrix of TF-IDF features
# Fit the vocabulary and IDF weights on the training set only, then transform it
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['tweet'])
# Only transform the test set, so it is encoded with the vectorizer fitted on the training set
X_test_tfidf = tfidf_vectorizer.transform(test_df['tweet'])