In [1]:
import ollama
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score

## Classification Schema Selection

Selection:

binary1: irrelevant vs. explicit/implicit/relchanges/releconomy

binary2: relchanges/releconomy/irrelevant vs. explicit/implicit

binary3: releconomy/irrelevant vs. explicit/implicit/relchanges

three_code1: irrelevant vs. implicit/relchanges/releconomy vs. explicit

three_code2: irrelevant vs. relchanges/releconomy vs. explicit/implicit

three_code3: releconomy/irrelevant vs. relchanges vs. explicit/implicit

four_code: irrelevant vs. relchanges/releconomy vs. explicit vs. implicit

four_code2: irrelevant vs. relchanges vs. explicit/implicit vs. releconomy

five_code: irrelevant vs. relchanges vs. explicit vs. releconomy vs. implicit
    

In [4]:
# Read the article corpus into a DataFrame; the path is relative to the
# notebook's working directory.
corpus = pd.read_json(path_or_buf='../data/old_data_clean.json')

def map_values(value):
    """Translate a five-category label name into its numeric code string.

    Returns '1' for 'explicit', '2' for 'implicit', '3' for 'relchanges',
    '4' for 'releconomy' and '5' for 'irrelevant'. Any other value yields
    'Unknown'.
    """
    label_codes = (
        ('irrelevant', '5'),
        ('relchanges', '3'),
        ('explicit', '1'),
        ('releconomy', '4'),
        ('implicit', '2'),
    )
    # Scan with == instead of a dict lookup so that unhashable inputs
    # simply fall through to 'Unknown' rather than raising.
    for label_name, code in label_codes:
        if value == label_name:
            return code
    return 'Unknown'

# Store the numeric code string for each row's five_code value in a new
# 'label' column.
corpus['label'] = corpus['five_code'].apply(map_values)

# Keep every row whose label is not '5' (the code map_values assigns to
# 'irrelevant') and renumber the remaining rows from 0 so they can be
# addressed by position-like integer labels afterwards.
rel_corpus = corpus.loc[corpus['label'].ne('5')].reset_index(drop=True)

In [None]:
# Upper bound, in characters, on the article body sent to the model.
# NOTE(review): this cap applies to the article text alone, while num_ctx
# below is 8020 tokens; whether the other messages and the reply still fit
# depends on the tokenizer -- verify long articles are not being truncated.
max_char = 4 * 8000

# Collects the raw model reply for each row of rel_corpus, in row order.
output_list = []

meta_statement = """We categorize articles that are related to issues of income inequality, changes in income or wealth, general economic conditions."""

inequality_definition = """Criteria for referencing economic inequality in the United States:
(1) Wage, earnings, pay, and income inequality.
(2) Causes and policy solutions related to economic inequality.
(3) Relational or comparative language among different social class groups.
"""

prompt = "Does the article reference American economic inequality? Respond with 'Yes' if article meets any or all criteria referencing American economic inequality and 'No' if article meets none of the criteria, and explain why in 1 sentence."

# The generation settings are identical for every article, so build them once.
chat_options = {
    "seed": 101,
    "temperature": 0,
    "num_ctx": 8020,
    "num_thread": 20,
}

for num in range(len(rel_corpus)):
    article_title = rel_corpus.loc[num, 'title']
    article_body = rel_corpus.loc[num, 'text'][:max_char]
    text = ''.join(('Title: ', article_title, '\n', article_body))

    # Progress trace: row number followed by the first 20 characters sent.
    print(num, text[:20], sep='\n')

    # One system message, then the framing statement, the definition, the
    # article and finally the question, each as its own user message.
    chat_messages = [
        {"role": "system", "content": "You are a news classifier."},
        {"role": "user", "content": meta_statement},
        {"role": "user", "content": "Read this definition: " + inequality_definition},
        {"role": "user", "content": "Read this article: " + text},
        {"role": "user", "content": prompt},
    ]

    response = ollama.chat(
        model="llama3:70b",
        messages=chat_messages,
        options=chat_options,
    )
    output_list.append(response["message"]["content"])

In [None]:
# Attach the raw model replies; output_list must hold exactly one reply per
# row of rel_corpus (pandas raises ValueError on a length mismatch).
rel_corpus['content'] = output_list

# gen_code starts out as the raw reply text and is overwritten with 1 / 0
# wherever the reply can be coded. Object dtype is forced so that integer
# codes can be written into a column that initially holds strings.
rel_corpus['gen_code'] = rel_corpus['content'].astype(object)

content_lower = rel_corpus['content'].str.lower()
rel_corpus.loc[content_lower.str.startswith('no'), 'gen_code'] = 0
rel_corpus.loc[content_lower.str.startswith('yes'), 'gen_code'] = 1

# Replies that open with one of these preambles instead of a bare Yes/No
# are coded 1.
for preamble in (
    'i have read the article',
    'based on the provided article',
    'here are my responses',
):
    rel_corpus.loc[content_lower.str.startswith(preamble), 'gen_code'] = 1

# Fallback: any reply that is still uncoded (gen_code is still text) and is
# longer than 3 characters is coded 0.
# The original read rel_corpus['llama_code'] here, a column this notebook
# never creates, which raises KeyError; the column being coded is gen_code.
still_text = rel_corpus['gen_code'].map(
    lambda value: isinstance(value, str) and len(value) > 3
).astype(bool)
rel_corpus.loc[still_text, 'gen_code'] = 0

# NOTE(review): 'code' is listed twice, so the selection below yields two
# identically named 'code' columns. Left unchanged because the intended
# second column cannot be determined from this notebook -- confirm and fix.
columns = ['title', 'year', 'month', 'journal', 'code', 'weight', 'code_label', 'id', 'code', 'content', 'gen_code']
rel_corpus = rel_corpus[columns]

In [24]:
# Persist the coded rows as CSV; with pandas' default settings the DataFrame
# index is written as the first column.
rel_corpus.to_csv(path_or_buf='../data/inequality_dataset_llama3-70b-Inequality-ResearcherDefinition.csv')