Install dependencies

In [None]:
%pip install transformers torch ijson
%pip install openai
%pip install gdown

# Download the Koo post data from Google Drive and extract it to a local directory

In [None]:
import gdown
import os
import zipfile
def download_file(source, download_dir) -> str:
    """
    Download a Google Drive file with gdown and return where it was saved.

    Args:
        source: Google Drive share link. The file id is taken from the
            second-to-last "/"-separated segment, which matches links shaped
            like ``https://drive.google.com/file/d/<file_id>/view?...``.
        download_dir: Destination path handed to ``gdown.download`` as its
            output argument (despite the name it is used as the output path).

    Returns:
        The value returned by ``gdown.download`` (the output path on success).

    Raises:
        IndexError: if ``source`` has fewer than two "/"-separated segments.
        Any exception raised by ``os.makedirs`` or ``gdown.download`` is
        propagated unchanged.
    """
    # Default data directory used elsewhere in this notebook (the
    # classification cell reads its input from it). Still created
    # unconditionally so existing callers keep working.
    os.makedirs("artifacts/data_ingestion", exist_ok=True)

    # Previously only the directory above was created, so any download_dir
    # located elsewhere failed because its parent folder did not exist.
    parent_dir = os.path.dirname(download_dir) if download_dir else ""
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    file_id = source.split("/")[-2]
    prefix = 'https://drive.google.com/uc?/export=download&id='

    # The function is annotated ``-> str`` but used to return None; hand back
    # gdown's result so callers can use the downloaded path.
    return gdown.download(prefix + file_id, download_dir)


def extract_zip_file(unzip_dir, data_file):
    """
    Unpack every member of a zip archive into a target directory.

    Args:
        unzip_dir: Directory to extract into; created (with parents) when it
            does not exist yet.
        data_file: Path of the zip archive to read.

    Returns:
        None
    """
    # Make sure the destination exists before anything is written to it.
    os.makedirs(unzip_dir, exist_ok=True)

    with zipfile.ZipFile(data_file, mode='r') as archive:
        archive.extractall(path=unzip_dir)

# Function to download the labelled data

In [None]:
from google.colab import files

def download_json(filename='labelled_data.json'):
    """
    Download a file from the Colab runtime to your local machine.

    Args:
        filename: Path of the file to download. Defaults to
            'labelled_data.json', the name that was previously hard-coded, so
            existing ``download_json()`` calls behave exactly as before.

    Returns:
        None
    """
    # `files` is google.colab.files, imported at the top of this cell.
    files.download(filename)

The following function prompts the gpt-4o-mini model to classify each post as political or non-political. For political posts it also determines the political alignment, the topic of the post, and the overall sentiment of the post.

The topics are limited to the following:

  1. Government Schemes and Initiatives
  2. Political Campaigns and Rallies
  3. Statements and Speeches by Politicians
  4. Social and Cultural Issues
  5. Economic Policies and Reforms
  6. Public Reactions and Opinions
  7. Development Projects and Infrastructure
  8. Law and Order
  9. Environmental Policies and Issues
  10. Health and Education Policies

The alignment of the post is determined through the following criteria:
"Left-Wing Politics: Communist Party of India (CPI), Communist Party of India (Marxist) (CPI(M)), Communist Party of India (Marxist–Leninist) Liberation (CPI(ML) Liberation), All India Forward Bloc (AIFB), Revolutionary Socialist Party (RSP)

Centrist Politics: Indian National Congress (INC), Nationalist Congress Party (NCP), Aam Aadmi Party (AAP), Biju Janata Dal (BJD), Telangana Rashtra Samithi (TRS), Dravida Munnetra Kazhagam (DMK), Yuvajana Sramika Rythu Congress Party (YSRCP)

Right-Wing Politics: Bharatiya Janata Party (BJP), Shiv Sena, Akhil Bharatiya Hindu Mahasabha, Rashtriya Swayamsevak Sangh (RSS) (Note: RSS is not a political party but has significant influence on the BJP), Shiromani Akali Dal (SAD)"



In [None]:
import requests

def _clean_field(value):
    """Trim surrounding whitespace and markdown emphasis markers from a value."""
    return value.strip().strip('*').strip()


def _parse_classification_response(response_content):
    """
    Extract (classification, political_alignment, topic, sentiment) from the
    model's reply text.

    Each reply line is split on ':' and matched against the answer labels
    case-insensitively, so both "political alignment" and "Political
    Alignment" are recognised. Fields whose label never appears stay None.
    When a label appears on several lines, the last one wins.
    """
    classification = None
    political_alignment = None
    topic = None
    sentiment = None

    for line in response_content.split('\n'):
        parts = line.split(':')
        if len(parts) < 2:
            # No "label: value" structure on this line.
            continue

        lowered = line.lower()
        if 'classification' in lowered:
            classification = _clean_field(parts[1])
        elif 'political alignment' in lowered:
            political_alignment = _clean_field(parts[1])
        elif 'identified topic' in lowered:
            # The value is the segment right after the one holding the label.
            # This covers both "3. Topic (...): Identified Topic: X" and the
            # shorter "Identified Topic: X"; the latter used to raise
            # IndexError (parts[2]) and discard the whole post.
            for index, segment in enumerate(parts[:-1]):
                if 'identified topic' in segment.lower():
                    topic = _clean_field(parts[index + 1])
                    break
        elif 'sentiment' in lowered:
            sentiment = _clean_field(parts[1])

    return classification, political_alignment, topic, sentiment


def classify_post(post, API_KEY, API_URL, timeout=60):
    """
    Ask the chat-completions endpoint to label a single post.

    Args:
        post: Mapping with a 'title' key holding the post text.
        API_KEY: Bearer token sent in the Authorization header.
        API_URL: URL of the chat-completions endpoint to POST to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple ``(classification, political_alignment, topic, sentiment)``.
        Individual entries are None when the reply does not contain them.
        All four are None when the request fails, the status code is not 200,
        or the response body does not have the expected structure; in those
        cases an "Error: ..." line is printed.

    Raises:
        KeyError: if ``post`` has no 'title' key.
    """
    # Create the prompt for the classification
    prompt = f"""
Classify the following post, identify its political alignment, topic, and analyze its sentiment. The post is considered related to politics if it mentions political parties, their members, their performance, or their policies, with politicians limited to those in the Lok Sabha (Lower house of Indian Parliament). If related to politics, classify the political alignment (Right-Wing Politics, Left-Wing Politics, Centrist Politics) in India according to the following classification:

Left-Wing Politics: Communist Party of India (CPI), Communist Party of India (Marxist) (CPI(M)), Communist Party of India (Marxist–Leninist) Liberation (CPI(ML) Liberation), All India Forward Bloc (AIFB), Revolutionary Socialist Party (RSP)
Centrist Politics: Indian National Congress (INC), Nationalist Congress Party (NCP), Aam Aadmi Party (AAP), Biju Janata Dal (BJD), Telangana Rashtra Samithi (TRS), Dravida Munnetra Kazhagam (DMK), Yuvajana Sramika Rythu Congress Party (YSRCP)
Right-Wing Politics: Bharatiya Janata Party (BJP), Shiv Sena, Akhil Bharatiya Hindu Mahasabha, Rashtriya Swayamsevak Sangh (RSS) (Note: RSS is not a political party but has significant influence on the BJP), Shiromani Akali Dal (SAD)

Post: {post['title']}

1. Classification (Related to Politics/Not Related to Politics):
2. If related to politics, identify the political alignment (Right-Wing Politics/Left-Wing Politics/Centrist Politics):
3. Topic (Choose from the list): Identified Topic:
  Topics:
  1. Government Schemes and Initiatives
  2. Political Campaigns and Rallies
  3. Statements and Speeches by Politicians
  4. Social and Cultural Issues
  5. Economic Policies and Reforms
  6. Public Reactions and Opinions
  7. Development Projects and Infrastructure
  8. Law and Order
  9. Environmental Policies and Issues
  10. Health and Education Policies

4. Sentiment (Positive/Negative/Neutral):
    """

    # Prepare the request payload
    # NOTE(review): temperature 0.7 makes labels vary between runs; consider
    # lowering it if reproducible classifications are required.
    payload = {
        'model': 'gpt-4o-mini',
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.7
    }

    # Set the headers
    headers = {
        'Authorization': f'Bearer {API_KEY}',
        'Content-Type': 'application/json'
    }

    # Send the POST request. A timeout keeps one stalled connection from
    # hanging a long batch, and network errors are reported the same way as
    # every other failure instead of aborting the caller's loop.
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None, None, None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None, None, None, None

    # Pull the assistant message out of the JSON body; any structural
    # surprise (invalid JSON, missing keys, empty choices, null content) is
    # reported and treated as a failed classification.
    try:
        response_content = response.json()['choices'][0]['message']['content'].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        print(f"Error: {e}")
        return None, None, None, None

    return _parse_classification_response(response_content)


Classify the posts and extract the labels from each model response. Only 10,000 posts are processed per run because of the usage limit on the API.

In [None]:
import json
import random

import os

# Set the OPENAI_API_KEY environment variable, or replace the placeholder
# below with your actual OpenAI API key. Reading the environment first keeps
# the secret out of the saved notebook.
API_KEY = os.environ.get('OPENAI_API_KEY', 'INSER YOUR API KEY')
API_URL = 'https://api.openai.com/v1/chat/completions'

INPUT_PATH = '/content/artifacts/data_ingestion/out.json'
OUTPUT_PATH = 'new_out.json'
MAX_POSTS = 10000      # per-run cap imposed by the API usage limit
PROGRESS_EVERY = 100   # print a progress report every N labelled posts

# JSON text is UTF-8; be explicit so the read does not depend on the locale.
with open(INPUT_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Shuffle so the capped run labels a random sample rather than the first
# MAX_POSTS entries of the file.
random.shuffle(data)

processed_data = []

processed_titles = set()

count = 0
try:
    for post in data:
        if post['title'] in processed_titles:
            continue  # Skip if the post has already been processed

        classification, political_alignment, topic, sentiment = classify_post(post, API_KEY, API_URL)
        if not classification:
            # The request failed or the reply had no classification line;
            # the post is left out of the output.
            continue

        post['classification'] = classification
        if classification == 'Related to Politics':
            post['political_alignment'] = political_alignment
            if topic:
                post['topic'] = topic
        else:
            post['political_alignment'] = 'N/A'
        if sentiment:
            post['sentiment'] = sentiment

        processed_data.append(post)
        processed_titles.add(post['title'])
        count += 1

        if count % PROGRESS_EVERY == 0:
            print(f"Processed {count} posts")
            print(processed_data[-10:])

        if count == MAX_POSTS:
            break
finally:
    # Save the processed data to a new JSON file. Doing this in `finally`
    # keeps the labels gathered so far if the run is interrupted or a
    # request raises part-way through.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, indent=4)

# Download the file that was just written. The previous `download_json()`
# call fetched 'labelled_data.json', which this cell never creates.
files.download(OUTPUT_PATH)

print(f"Classification complete and saved to {OUTPUT_PATH}")
