In [5]:
import pandas as pd
import os
import transformers
import torch

file_path = '/content/project_data.csv'

# Fail fast when the CSV is missing. Merely printing a message and carrying on
# would leave `df` undefined, so the first use of `df` would fail with a
# NameError that hides the real cause.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)
print(df.head())

# Remove these columns in a single in-place call. Unlike a sequence of
# `df.pop(...)` calls, this either removes all of them or (if one is missing)
# raises KeyError without leaving the frame partially modified.
df.drop(
    columns=['user_name', 'location_name', 'latitude', 'longitude', 'query'],
    inplace=True,
)

from transformers import BertTokenizer

# Module-level tokenizer loaded from the 'bert-base-uncased' checkpoint;
# tokenize_text() uses this single shared instance for every text it encodes.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text, max_length: int = 128):
    """Encode one text into fixed-length BERT inputs.

    Args:
        text: The string to encode with the module-level ``tokenizer``.
        max_length: Number of token positions in the output. Shorter inputs
            are padded and longer ones truncated to this length. Defaults to
            128, the value that used to be hard-coded here.

    Returns:
        The tokenizer's encoding, including ``input_ids`` and
        ``attention_mask`` as PyTorch tensors (one row per call, since a
        single text is encoded).
    """
    # Call the tokenizer directly: ``encode_plus`` is deprecated in favour of
    # ``__call__``, which accepts the same keyword arguments.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )

# Tokenize all tweet texts
tokenized_texts = df['text'].apply(tokenize_text)

# Convert tokenized output to tensors by concatenating the per-tweet rows
# along dim 0. Iterate over the Series values instead of indexing with
# range(len(...)): `tokenized_texts[i]` is a label lookup and raises KeyError
# as soon as the DataFrame index is not exactly 0..n-1 (e.g. after filtering).
input_ids = torch.cat(
    [encoding['input_ids'] for encoding in tokenized_texts], dim=0
)
attention_masks = torch.cat(
    [encoding['attention_mask'] for encoding in tokenized_texts], dim=0
)

from transformers import BertForSequenceClassification, AdamW, get_scheduler

# Build BERT with a 3-class sequence-classification head from the
# 'bert-base-uncased' checkpoint.
# NOTE: the recorded cell output warns that 'classifier.weight' and
# 'classifier.bias' are newly initialized for this checkpoint and that the
# model should be trained on a down-stream task before its predictions are used.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Run on CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Prediction function
def predict_ideology(text):
    """Classify a single text and return its ideology label.

    The text is encoded with ``tokenize_text``, passed through ``model`` on
    ``device`` with gradient tracking disabled, and the index of the largest
    logit is translated to 'Liberal', 'Conservative' or 'Neutral'.
    """
    encoded = tokenize_text(text)

    # Inference only, so no autograd bookkeeping is needed
    with torch.no_grad():
        result = model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

    # Highest-scoring class for the one encoded text
    class_index = result.logits.argmax(dim=1).item()

    # Map predicted class back to ideology label
    return {0: 'Liberal', 1: 'Conservative', 2: 'Neutral'}[class_index]

# Run predict_ideology on every value of the 'text' column and store the
# resulting labels in a new 'predicted_ideology' column
df['predicted_ideology'] = df['text'].map(predict_ideology)

# Show the first rows, now including the predicted labels
print(df.head())


   Unnamed: 0      tweet_id  \
0           1  1.346800e+18   
1           2  1.346819e+18   
2           3  1.346819e+18   
3           4  1.346818e+18   
4           5  1.346911e+18   

                                                text        query  \
0  right because letting the pandemic run riot wa...  death rates   
1  revjoe great show patriots thank you more pray...        covid   
2  mean while still no riot gear make it make sen...        covid   
3  not prisons though not solitary confinement no...        covid   
4  only a true patriot would travel to washington...       travel   

        user_id user_name  follower_count  user_tweet_count  likes  retweets  \
0  3.676491e+08       NaN              68              6993      6         0   
1  2.400917e+09       NaN            3217             29577      1         0   
2  1.340492e+18       NaN              86               694      0        16   
3  1.059082e+18       NaN            3355              7616      0        17  

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Unnamed: 0      tweet_id  \
0           1  1.346800e+18   
1           2  1.346819e+18   
2           3  1.346819e+18   
3           4  1.346818e+18   
4           5  1.346911e+18   

                                                text       user_id  \
0  right because letting the pandemic run riot wa...  3.676491e+08   
1  revjoe great show patriots thank you more pray...  2.400917e+09   
2  mean while still no riot gear make it make sen...  1.340492e+18   
3  not prisons though not solitary confinement no...  1.059082e+18   
4  only a true patriot would travel to washington...  1.732348e+07   

   follower_count  user_tweet_count  likes  retweets user_location  \
0              68              6993      6         0           NaN   
1            3217             29577      1         0            CA   
2              86               694      0        16           NaN   
3            3355              7616      0        17            NY   
4             883             38101      1