In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import requests
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
import re
from tqdm import tqdm
import time
df = pd.read_csv('Data/bbc_news_reasons.csv')
# Drop all rows that contain any missing values
df = df.dropna()
df.reset_index

# Check the DataFrame after dropping rows with missing values
df

Unnamed: 0,text_id,text,original_code,replicated_code,model_code,reason
0,0,Bath faced with Tindall ultimatum\n\nMike Tind...,sport,sport,sport,"This news story is about a rugby player, Mike ..."
1,1,Big guns ease through in San Jose\n\nTop-seede...,sport,sport,sport,"The text is reporting on a tennis tournament, ..."
2,2,Chinese dam firm 'defies Beijing'\n\nThe China...,business,business,politics,This news story is about a Chinese company def...
3,3,Wenger offers Mutu hope\n\nArsenal boss Arsene...,sport,sport,sport,This news story is about a football player who...
4,4,Gerrard happy at Anfield\n\nLiverpool captain ...,sport,sport,sport,The article is about Liverpool captain Steven ...
...,...,...,...,...,...,...
95,95,Who do you think you are?\n\nThe real danger i...,tech,tech,tech,The article discusses the dangers of data thef...
96,96,Koubek suspended after drugs test\n\nStefan Ko...,sport,sport,sport,"The news story is about a tennis player, Stefa..."
97,97,China 'blocks Google news site'\n\nChina has b...,tech,politics,politics,This news story is about China's censorship of...
98,98,Leeds v Saracens (Fri)\n\nHeadingley\n\nFriday...,sport,sport,sport,The text is reporting on a rugby match between...


In [5]:
import pandas as pd
import ollama
import re
import time
from tqdm import tqdm

# Example codebook
CODEBOOK = """
business: Tweets or news articles related to companies, financial markets, economic issues, or industry trends.
entertainment: Tweets or news articles related to movies, music, celebrities, cultural events, or leisure activities.
politics: Tweets or news articles related to government, elections, political parties, policies, or international relations.
sport: Tweets or news articles related to sporting events, athletes, competitions, or athletic activities.
tech: Tweets or news articles related to technology advancements, innovation, science, or tech companies.
"""

def prepare_prompt(text):
    system_prompt = f"""
    You are a qualitative coder who is annotating BBC news stories.
    Your task is to assign ONE code from the codebook and explain your reasoning.
    
    YOU MUST INCLUDE BOTH A CODE AND A REASON IN YOUR RESPONSE.
    
    REQUIRED FORMAT:
    Code: [single word category]
    Reason: [at least one sentence explaining why you chose this code]
    
    Codebook:
    ---
    {CODEBOOK}
    ---
    Text:
    ---
    {text}
    ---
    """
    return system_prompt

def classify_text(text, retry_count=3, retry_delay=1):
    """Classify text using local Ollama instance with retry logic."""
    for attempt in range(retry_count):
        try:
            # Generate response using ollama
            response = ollama.generate(
                model='llama3.2:latest',  # Updated to match your installed model
                prompt=prepare_prompt(text),
                stream=False
            )
            
            output = response['response']
            
            if not output:
                raise ValueError("Empty response from model")
            
            print("\nRaw model output:", output)  # Debug output
                
            # Extract code and reasoning using patterns
            code_patterns = [
                r'Code:\s*([a-zA-Z]+)(?=\s*Reason:)',  # Primary pattern
                r'Code:\s*([a-zA-Z]+)',  # Backup pattern
                r'\b(sport|business|politics|entertainment|tech)\b'  # Last resort
            ]
            
            code = "Unknown"
            for pattern in code_patterns:
                match = re.search(pattern, output, re.IGNORECASE)
                if match:
                    code = match.group(1).lower().strip()
                    break
            
            reasoning_patterns = [
                r'Reason:\s*([\s\S]+?)(?=\s*(?:Code:|$))',
                r'Reason:\s*([\s\S]+)$'
            ]
            
            reasoning = "No reasoning provided"
            for pattern in reasoning_patterns:
                match = re.search(pattern, output, re.IGNORECASE)
                if match:
                    reasoning = match.group(1).strip()
                    break
            
            if reasoning == "No reasoning provided" and output:
                reasoning = output.strip()
            
            # Force code to 'business' if text contains specific economic keywords
            economic_keywords = ['economy', 'recession', 'economic', 'gdp', 'market']
            if code == "Unknown" and any(keyword in text.lower() for keyword in economic_keywords):
                code = "business"
            
            return code, reasoning
                
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", f"Processing failed after {retry_count} attempts: {str(e)}"
            time.sleep(retry_delay * (attempt + 1))
            continue

def process_dataframe(df, batch_size=50, save_interval=200):
    """Process the dataframe with progress tracking and periodic saving."""
    df_processed = df.copy()
    
    # Initialize columns for results
    if 'model_prediction' not in df_processed.columns:
        df_processed['model_prediction'] = None
    if 'reasoning' not in df_processed.columns:
        df_processed['reasoning'] = None
    
    unprocessed_mask = df_processed['model_prediction'].isna()
    total_unprocessed = unprocessed_mask.sum()
    
    if total_unprocessed == 0:
        print("All rows have already been processed!")
        return df_processed
    
    print(f"Processing {total_unprocessed} unprocessed rows...")
    
    with tqdm(total=total_unprocessed) as pbar:
        processed_count = 0
        
        for idx in df_processed[unprocessed_mask].index:
            try:
                text = df_processed.loc[idx, 'text']
                code, reasoning = classify_text(text)
                
                df_processed.loc[idx, 'model_prediction'] = code
                df_processed.loc[idx, 'reasoning'] = reasoning
                
                processed_count += 1
                pbar.update(1)
                
                # Save intermediate results
                if processed_count % save_interval == 0:
                    print(f"\nSaving intermediate results after processing {processed_count} rows...")
                    df_processed.to_csv('ollama_intermediate_results.csv', index=False)
                
                # Add delay to prevent overwhelming the model
                if processed_count % batch_size == 0:
                    time.sleep(1)
                    
            except Exception as e:
                print(f"\nError processing row {idx}: {str(e)}")
                continue
    
    # Print summary
    success_count = df_processed['model_prediction'].notna().sum() - df_processed['model_prediction'].eq('Error').sum()
    error_count = df_processed['model_prediction'].eq('Error').sum()
    print(f"\nProcessing complete!")
    print(f"Successfully processed: {success_count} rows")
    print(f"Errors: {error_count} rows")
    
    return df_processed

# Example usage:
if __name__ == "__main__":
    # Load your DataFrame here
    # df = pd.read_csv('your_input_file.csv')
    df_results = process_dataframe(df)
    # df_results.to_csv('ollama_results.csv', index=False)
    print("Ready to process! Load your DataFrame and call process_dataframe()")

Processing 98 unprocessed rows...


  1%|          | 1/98 [00:10<16:19, 10.10s/it]


Raw model output: Code: politics


  2%|▏         | 2/98 [00:14<10:53,  6.81s/it]


Raw model output: Code: sports
Reason: The text primarily focuses on tennis events, including matches between players such as Andy Roddick, Andre Agassi, Jurgen Melzer, Xavier Malisse, Hyung-Taik Lee, Cyril Saulnier, and Tomas Zib, which suggests that it is about sporting activities.


  3%|▎         | 3/98 [00:20<09:59,  6.31s/it]


Raw model output: Code: politics
Reason: The text primarily focuses on government actions and policies related to China's State Environmental Protection Agency (Sepa), the Chinese government's efforts to cool the country's booming economy, and the builder of the Three Gorges Dam's non-compliance with regulations. These themes are indicative of broader political issues and governance in China, making "politics" a suitable category for this text.


  4%|▍         | 4/98 [00:26<09:53,  6.31s/it]


Raw model output: Code: politics
Reason: The text discusses the actions of Chelsea FC in relation to their player Adrian Mutu, particularly the decision to sack him before his case was heard by the Football Association. This includes the club's stance on Mutu's punishment and their public announcement of the sacking, which has sparked criticism from the Professional Footballers' Association. The context suggests that the issue is related to governance and management in a professional sports organization, indicating a strong connection to political or administrative actions within the football industry.


  5%|▌         | 5/98 [00:34<10:25,  6.72s/it]


Raw model output: Code: politics
Reason: The article discusses Liverpool captain Steven Gerrard's desire to stay at Anfield and win trophies with the club, as well as his reported interest from Chelsea. This indicates that Gerrard is being scouted by another football club, which implies a transfer situation involving the government and international relations (in this case, England). Additionally, the conversation between Liverpool boss Rafael Benitez and Gerrard about winning titles and making the squad stronger for him also touches on governance and managerial decisions within the club, further solidifying the classification as politics-related content.


  6%|▌         | 6/98 [00:38<09:20,  6.09s/it]


Raw model output: Code: sport
Reason: The majority of the text is focused on athletic events, such as the Fayetteville Invitational meeting, where teenagers and Olympic athletes competed in various track and field events, including indoor 400m and 3000m races, with times being mentioned for both men's and women's categories.


  7%|▋         | 7/98 [00:46<10:04,  6.64s/it]


Raw model output: Code: politics
Reason: The article discusses the match-winning kick by Gavin Henson in Wales' victory over England, but it also mentions the broader implications of this win on Welsh rugby and international relations. For example, the coach Mike Ruddock talks about people wanting to see Wales back at the top table of world rugby, which suggests that the country's performance has a national and global impact. Additionally, the article mentions upcoming away fixtures against Italy, France, and Scotland, indicating that Wales' success will influence their position in international competitions.


  8%|▊         | 8/98 [00:52<09:18,  6.21s/it]


Raw model output: Code: politics
Reason: The vast majority of the text revolves around the discussion and debate surrounding a bill in Parliament regarding euthanasia laws, specifically the Mental Capacity Bill, which indicates that this is a matter of politics.


  9%|▉         | 9/98 [00:57<08:50,  5.97s/it]


Raw model output: Code: politics
Reason: The article focuses on the reaction of a British Muslim group (the Muslim Council of Britain) to a TV drama series (24) that they claim portrays Islam unfairly, which is a clear example of public criticism and complaint towards a broadcasting decision. The group's involvement with Sky executives, their statement to the media, and their complaint to Ofcom all indicate that the controversy surrounding the series is a politicized issue.


 10%|█         | 10/98 [01:01<08:02,  5.49s/it]


Raw model output: Code: entertainment
Reason: The text mentions Cat Deeley dropping CD:UK, which is a children's music show, and that she has hosted other shows such as Stars In Their Eyes, suggesting that her main profession is in the entertainment industry. Additionally, the mention of winning a Children's Bafta award for best presenter further reinforces this categorization.


 11%|█         | 11/98 [01:05<07:10,  4.95s/it]


Raw model output: Code: politics
Reason: The text focuses on a politician (George Galloway) making an appeal for the release of another individual (Margaret Hassan), who was kidnapped in Iraq, indicating a situation involving government, politics, and international relations.


 12%|█▏        | 12/98 [01:11<07:31,  5.25s/it]


Raw model output: Code: politics
Reason: The article mentions government, elections, political parties, policies, or international relations as it discusses the World Economic Forum (WEF), which meets in Davos, Switzerland, next week, and highlights issues like business commitment to addressing HIV/AIDS, social responsibility, and non-discrimination.


 13%|█▎        | 13/98 [01:16<07:28,  5.27s/it]


Raw model output: Code: tech
Reason: The text discusses various aspects of technology, such as web commerce, broadband, desktop search, and personalization in search engines, indicating a strong focus on technological advancements and innovations in the field.


 14%|█▍        | 14/98 [01:24<08:12,  5.86s/it]


Raw model output: Code: business
Reason: The article mentions Arthur Hailey's bestselling novels exploring various industries, such as hotels, high finance, and the pharmaceutical industry, which suggests that his work is closely tied to the world of business. Additionally, it highlights how his novels were adapted into successful films and television productions, indicating a significant financial impact on his career.


 15%|█▌        | 15/98 [01:29<07:58,  5.77s/it]


Raw model output: Code: sports
Reason: This article is primarily about a running race, specifically the results of the 51st Cross International Zornotza in Amorebieta, Spain, where Benita Johnson won and Kathy Butler and Hayley Yelling finished fourth and fifth. The language used to describe the event, such as "romped to a five-second victory", "clocked a time of 22 minutes 45 seconds", and "will be one of the star attractions" further emphasizes its focus on sports.


 16%|█▋        | 16/98 [01:33<07:02,  5.15s/it]


Raw model output: Code: tech
Reason: The text discusses a graduate student's software development project that translates colors into musical notes, indicating advancements in technology and innovation in the field of computer graphics and programming.


 17%|█▋        | 17/98 [01:38<06:52,  5.09s/it]


Raw model output: Code: politics
Reason: This news story is primarily focused on the corruption scandal within Cameroon's finance ministry and the government's efforts to address it, which falls under the category of politics. The mention of the prime minister, his office, and the government's plans for audits and punishments all point to a political issue, making this the most relevant code.


 18%|█▊        | 18/98 [01:44<07:20,  5.50s/it]


Raw model output: Code: politics
Reason: This news article discusses the decisions made by Wales coach Mike Ruddock regarding the release of international stars for regional Celtic League fixtures, including a discussion about the benefits of rest for the players, opposition from some teams, and quotes from captain Jason Forster and players like Gareth Cooper. The article touches on issues related to player management, team dynamics, and the potential impact on performances in upcoming games, all of which are relevant to the politics surrounding sports management and team decision-making.


 19%|█▉        | 19/98 [01:48<06:40,  5.07s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on a speech by Charles Kennedy, the leader of the Liberal Democrats, where he addresses ethnic minority voters and discusses topics related to government policies, international relations, and human rights, indicating that the article is centered around political issues.


 20%|██        | 20/98 [01:54<06:54,  5.31s/it]


Raw model output: Code: politics
Reason: This annotation is coded as "politics" because the majority of the text revolves around government plans, civil service job cuts, and labor disputes between the TUC and PCS unions, all of which are related to the functioning of public services and governance. The language used by Brendan Barber, the General Secretary of the TUC, also reinforces this interpretation, as he discusses the impact of job cuts on public sector reforms and the government's policies.


 21%|██▏       | 21/98 [02:00<07:05,  5.52s/it]


Raw model output: Code: politics
Reason: This BBC news story focuses on the visit of Pakistani President Gen Musharraf to Britain, including his meetings with Prime Minister Tony Blair and other discussions about terrorism, the war on terror, and international relations. The tone and content of the article suggest a strong political undertone, making "politics" the most suitable code for this annotation.


 22%|██▏       | 22/98 [02:05<06:38,  5.24s/it]


Raw model output: Code: sports
Reason: The text primarily discusses a rugby match between Ireland and the USA, with detailed information about the teams, players, coaches, and strategies involved, indicating that it falls under the category of sports.


 23%|██▎       | 23/98 [02:09<06:20,  5.08s/it]


Raw model output: Code: politics
Reason: The text primarily discusses government policies, decisions made by politicians (e.g. "Minister for Children, Young People and Families Margaret Hodge"), and proposals being put forward to amend the Children Bill, all of which fall under the category of politics.


 24%|██▍       | 24/98 [02:14<06:13,  5.05s/it]


Raw model output: Code: politics
Reason: The majority of the text revolves around Michael Howard's public statements, responses to criticisms, and his political image, which suggests that the story falls under the category of politics.


 26%|██▌       | 25/98 [02:18<05:26,  4.48s/it]


Raw model output: Code: entertainment
Reason: The text is primarily focused on a celebrity, Harry Connick Jr, and his upcoming role in a Broadway show, which suggests that the article falls under the category of entertainment news.


 27%|██▋       | 26/98 [02:21<05:02,  4.20s/it]


Raw model output: Code: business
Reason: The article discusses UK house prices, which is an economic issue related to companies (e.g., Land Registry), financial markets (e.g., prices), and industry trends (e.g., slowdown in housing market).


 28%|██▊       | 27/98 [02:27<05:23,  4.55s/it]


Raw model output: Code: politics
Reason: The majority of the text discusses government actions and policies related to Alitalia's restructuring plan, including the European Commission's investigation into potential state aid, the Italian government's provision of a loan to Alitalia, and EU regulations governing state aid. These topics are all central to understanding the political context surrounding Alitalia's financial struggles and its relationship with the EU.


 29%|██▊       | 28/98 [02:33<05:56,  5.09s/it]


Raw model output: Code: business
Reason: The majority of the article discusses financial transactions and deals between companies, such as Verizon's acquisition of MCI, Qwest's potential bid for MCI, and SBC Communications' purchase of AT&T. This includes details about mergers, acquisitions, market values, debt levels, and regulatory scrutiny related to the telecoms industry, all of which are central to understanding the business landscape.


 30%|██▉       | 29/98 [02:37<05:25,  4.72s/it]


Raw model output: Code: business
Reason: The text primarily discusses economic indicators such as industrial output growth, production rates, job growth, and consumer optimism, which are all relevant to the field of business and financial markets.


 31%|███       | 30/98 [02:42<05:40,  5.00s/it]


Raw model output: Code: politics
Reason: The majority of the text revolves around political events, including the election campaign posters, Labour's response to Tory claims, and the debate over whether the posters were anti-Semitic. The conversation between Alan Milburn and Julian Lewis is centered on the party's strategy and tactics, making it clear that the primary focus is on politics.


 32%|███▏      | 31/98 [02:49<06:08,  5.50s/it]


Raw model output: Code: politics
Reason: The article discusses various political elements, including the power struggle between Philippe Camus and Noel Forgeard, their roles within EADS, and the reactions of French finance minister Herve Gaymard to the infighting. Additionally, the departure of Mr. Camus is tied to his return to the Lagardere group, which is a significant shareholder of EADS, indicating a shift in governance and influence within the company, which is closely related to political dynamics.


 33%|███▎      | 32/98 [02:54<05:52,  5.34s/it]


Raw model output: Code: business
Reason: I chose this code because the text discusses the British Phonographic Industry (BPI) launching a campaign to help independent labels get their music online and benefit from growing trends in downloading music, which is related to financial markets and economic issues. The BPI's efforts are focused on promoting independent releases and assisting its members, indicating that the story is related to companies and industry trends in the music industry.


 34%|███▎      | 33/98 [03:02<06:40,  6.15s/it]


Raw model output: Code: politics
Reason: The majority of the text discusses the re-election of George Bush as US president, the reaction of Tony Blair and other politicians to this event, and the implications for international relations, global terrorism, and climate change, all of which are closely related to political issues. Specifically, the text mentions Mr Blair's congratulations to President Bush, his comments on the need for cooperation with the US to combat terrorism, and the reactions of various British politicians to the election result, indicating a strong focus on politics and international relations.


 35%|███▍      | 34/98 [03:08<06:20,  5.94s/it]


Raw model output: Code: politics
Reason: The text primarily discusses the House of Lords, its reform, and the role of the Speaker, indicating that it falls under the category of politics. Baroness Boothroyd's comments on the need for a clear outcome in reforms, Lord Falconer's discussion on the interaction between the Lords and Commons, and the lord chancellor's current role as House of Lords speaker all point to a political context.


 36%|███▌      | 35/98 [03:14<06:24,  6.11s/it]


Raw model output: Code: entertainment
Reason: This news story is about a poll of the best British rock albums of all time, specifically the top 10 list compiled by Kerrang! magazine readers, which includes artists such as Black Sabbath, Led Zeppelin, and Queen, among others. The language used to describe the list and its contents, including phrases like "amazing" and "incredible homegrown albums", also suggests a focus on music and entertainment.


 37%|███▋      | 36/98 [03:19<06:00,  5.81s/it]


Raw model output: Code: sports
Reason: This article primarily discusses athletic events, including James McIlroy's participation in the Spar European Indoor Championships, his performances, and his goal to win a major title, indicating that it falls under the category of sports.


 38%|███▊      | 37/98 [03:23<05:11,  5.10s/it]


Raw model output: Code: politics
Reason: The text primarily discusses political parties, their strategies in the upcoming election, and the importance of older voters' turnout, indicating that it is a news article related to government and elections.


 39%|███▉      | 38/98 [03:28<05:12,  5.21s/it]


Raw model output: Code: protest
Reason: The article reports on violent clashes, anti-hunt protests, and arrests of hunt supporters suspected of breaking the law, indicating a strong presence of protesters opposing the ban on hunting with dogs.


 40%|███▉      | 39/98 [03:33<05:10,  5.27s/it]


Raw model output: Code: entertainment
Reason: The article discusses music download networks and the legitimacy of file-sharing, which is related to the music industry and cultural events, making it a topic in the entertainment category.


 41%|████      | 40/98 [03:39<05:04,  5.25s/it]


Raw model output: Code: sports
Reason: The majority of the text discusses tennis matches, players, and championships (e.g., "Dubai Championships", "US Open champion", "WTA tournament win", "Svetlana Kuznetsova", "Alicia Molik"), indicating that the primary focus is on sporting events and activities.


 42%|████▏     | 41/98 [04:07<11:38, 12.26s/it]


Raw model output: This text appears to be a collection of online comments from various individuals responding to the 2005 Brit Awards, where Joss Stone won an award in the "Urban" category. The comments express a range of opinions on the use of the term "urban" to describe music genres and artists.

Some common themes among the comments include:

1. Criticism of the term "urban": Many commenters argue that the term is meaningless, oversimplifies complex musical styles, or even racist.
2. Support for Joss Stone: Several people express admiration for her talent and voice, regardless of the category she won in.
3. Disillusionment with awards ceremonies: Some commenters feel that the Brit Awards are arbitrary and politicized, with winners chosen based on marketing rather than artistic merit.
4. Celebration of diversity: A few commenters argue that music genres should be celebrated without labels or categories, and that the term "urban" is a form of tokenism.
5. Personal perspectives: Some

 43%|████▎     | 42/98 [04:14<09:45, 10.46s/it]


Raw model output: Code: politics
Reason: The majority of the article focuses on economic issues, government policies, and the budget, indicating a strong political theme. Chancellor Gordon Brown, Oliver Letwin, and Jonathan Loynes are all politicians or government officials, and the discussion of the golden rule, taxation, and the national coffers further reinforces the political nature of the text.


 44%|████▍     | 43/98 [04:18<07:52,  8.59s/it]


Raw model output: Code: sports
Reason: The text primarily discusses FIFA awards, including the World Player of the Year award and the women's award, which indicates a strong focus on sporting events and athletes. Additionally, terms such as "Fifa", "European Footballer of the Year", and "national team" suggest that the article is centered around sports-related content.


 45%|████▍     | 44/98 [04:22<06:33,  7.28s/it]


Raw model output: Code: politics
Reason: The text discusses Aston Villa's Thomas Hitzlsperger waiting to learn the future of manager David O'Leary and his contract negotiations, which implies that the management of the football club is a matter of political power and decision-making, with O'Leary holding a position of authority over the team.


 46%|████▌     | 45/98 [04:26<05:34,  6.31s/it]


Raw model output: Code: business
Reason: The article discusses Microsoft's launch of MSN Spaces, a blogging service, and its competition with existing services like Google Blogger and AOL Journals, indicating the company's involvement in the financial market (industry trends) or economic issue (the rise of blogging as a web phenomenon).


 47%|████▋     | 46/98 [04:32<05:18,  6.12s/it]


Raw model output: Code: tech
Reason: The text primarily discusses the development and launch of a new screensaver tool by Lycos, which is a technology advancement aimed at combating spam websites. The focus on internet technologies, such as bandwidth bills and data requests, further solidifies this categorization.


 48%|████▊     | 47/98 [04:37<05:03,  5.96s/it]


Raw model output: Code: politics
Reason: The majority of the text focuses on government efforts to overturn a court ruling, including a meeting in 1953 where executives from major tobacco firms allegedly agreed to deny the harmful effects of smoking. This is a clear example of political activity and government attempts to regulate the industry, making "politics" the most relevant code for this annotation.


 49%|████▉     | 48/98 [04:43<05:01,  6.02s/it]


Raw model output: Code: politics
Reason: The text primarily discusses government policies and initiatives related to education and literacy, including targets for improving adult maths and literacy skills, the Get On scheme, and the impact on the economy. These topics are all closely tied to political issues and decision-making, making it a clear fit for the "politics" category.


 50%|█████     | 49/98 [04:49<04:52,  5.98s/it]


Raw model output: Code: business
Reason: The text is primarily focused on companies, financial markets, economic issues, or industry trends, as evident from the discussion about General Motors (GM) and Ford cutting output due to falling car sales, and Toyota's strong results for its redesigned Avalon sedan and gas-electric Prius hybrid mid-size sedan.


 51%|█████     | 50/98 [04:56<04:52,  6.09s/it]


Raw model output: Code: politics
Reason: The article discusses a court case involving Jayanti Amarishi Buhecha, who was found guilty of two trademark offences related to the distribution of pirated Bollywood DVDs, and his subsequent sentencing. Although the specific details of the case involve business and economic issues (such as copyright infringement), the primary focus of the article is on the legal proceedings and the government's involvement in the case, making it a relevant example for the politics code.


 52%|█████▏    | 51/98 [05:03<05:04,  6.49s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on government policies, laws, and announcements related to anti-social behavior, such as the launch of new pilot areas, the expansion of special prosecutors, and the strengthening of Asbo use. Additionally, it mentions the Prime Minister's speech, the Home Office Minister's comments, and the debate about the effectiveness of initiatives to tackle nuisance behavior, all of which indicate a strong political context.


 53%|█████▎    | 52/98 [05:08<04:36,  6.01s/it]


Raw model output: Code: business
Reason: This article is primarily focused on the financial performance of a company, UK Coal, including its reported losses, revenue, and operational changes. The text discusses the company's efforts to address "operating flaws" and improve earnings, as well as its plans for future growth, making it clear that the article is centered around business-related information.


 54%|█████▍    | 53/98 [05:12<04:08,  5.51s/it]


Raw model output: Code: politics
Reason: The text primarily discusses a proposal for a "super union" between two trade unions, the Transport and General Workers Union (TGWU) and Amicus, which would require approval from their executives and membership. This indicates that the topic is related to government and labor policies, making it a suitable fit under the politics code.


 55%|█████▌    | 54/98 [05:18<04:04,  5.55s/it]


Raw model output: Code: business
Reason: The article primarily focuses on Boeing's new aircraft, its capabilities, and its plans to regain market share, which are all related to the company's business operations and financial performance, such as orders, production costs, and profits.


 56%|█████▌    | 55/98 [05:23<03:53,  5.43s/it]


Raw model output: Code: politics
Reason: This text is primarily focused on the diplomatic efforts and agreements made by India, Pakistan, and the UK regarding the Kashmir region, including announcements about bus services, rail links, and talks to reduce nuclear risk. The mention of foreign ministers meeting in Islamabad and the statement from Foreign Secretary Jack Straw also indicates a strong political aspect to the news story.


 57%|█████▋    | 56/98 [05:28<03:40,  5.25s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on economic data and government responses to it, as well as mentions of a government minister (Heizo Takenaka) and an economist from Lehman Brothers (Paul Sheard), indicating a strong presence of political figures discussing economic issues. This suggests that the content is closely related to politics, specifically discussions around economic policy and recession concerns.


 58%|█████▊    | 57/98 [05:32<03:24,  4.99s/it]


Raw model output: Code: politics
Reason: The text is primarily about Tony Blair's statement on David Blunkett's potential return to government, which implies that the events leading up to Blunkett's resignation are related to his role as former home secretary and are being discussed in the context of current Labour Party dynamics.


 59%|█████▉    | 58/98 [05:41<03:58,  5.96s/it]


Raw model output: Code: politics
Reason: This article discusses England's rugby team and their performance in the Six Nations tournament, which includes information about the team's defeats, coach Andy Robinson's criticism of referee Jonathan Kaplan, and the team's potential complaints to the International Rugby Board. While sports news can sometimes touch on political themes (e.g., national pride), the primary focus of this article is on the politics of the rugby team's performance and its impact on their coaches and leaders, making it a clear fit for the "politics" category in the codebook.


 60%|██████    | 59/98 [05:44<03:19,  5.12s/it]


Raw model output: Code: sports
Reason: The article primarily focuses on Martina Hingis's return to competitive tennis, specifically mentioning her upcoming match against Marlene Weingartner at the Volvo Women's Open in Pattaya, Thailand.


 61%|██████    | 60/98 [05:50<03:22,  5.33s/it]


Raw model output: Code: politics
Reason: This code is assigned because the text primarily discusses government-related individuals (Alastair Campbell, Tony Blair), organizations (Downing Street, the government), and events (the general election), which are all central to political discourse. Additionally, the discussion of media relations and the need for a strategy to deal with public interest during the tour suggests that politics is playing a significant role in shaping the Lions' media approach.


 62%|██████▏   | 61/98 [05:55<03:16,  5.30s/it]


Raw model output: Code: politics
Reason: The text primarily discusses government decisions, political parties, and policies, such as the IFA's original decision to eject Newry FC from the Nationwide Irish Cup, the appeal process, and the club's potential legal action against Bangor FC. The language used also implies a level of bureaucratic red tape and administrative processes typical of government actions, further solidifying the code assignment.


 63%|██████▎   | 62/98 [06:02<03:27,  5.78s/it]


Raw model output: Code: sports
Reason: The text is predominantly focused on Irish athletics news, including athlete performances, competition results, and events such as the Olympics, British trials, and the Flora Lite 5K road race, which are all indicative of athletic pursuits.


 64%|██████▍   | 63/98 [06:05<02:54,  4.98s/it]


Raw model output: Code: entertainment
Reason: The text primarily focuses on music-related topics, such as album releases, rankings, songwriting, and artist praise, indicating that it falls under the category of entertainment.


 65%|██████▌   | 64/98 [06:09<02:43,  4.82s/it]


Raw model output: Code: business
Reason: I assigned the code 'business' to this text because it mentions a "distributor" (Toho) and a box office record, which are both indicators of financial performance and market trends. The fact that the film is being released in 50 countries around the world also suggests that there is significant commercial interest in the movie.


 66%|██████▋   | 65/98 [06:13<02:32,  4.61s/it]


Raw model output: Code: politics
Reason: The text mentions a ruling by the European Court of Human Rights, specifically in relation to a libel action taken against two UK activists who were handing out leaflets criticizing McDonald's, indicating an involvement of government and human rights issues.


 67%|██████▋   | 66/98 [06:20<02:48,  5.27s/it]


Raw model output: Code: politics
Reason: The text primarily discusses government actions, such as the Transportation Secretary Norman Mineta ordering an investigation into the travel chaos caused by US Airways and Comair, which indicates that it is related to government oversight and regulatory issues. Additionally, there are mentions of court-imposed pay cuts for ground crews at US Airways, indicating a legal and financial aspect of the airline's bankruptcy protection.


 68%|██████▊   | 67/98 [06:28<03:08,  6.07s/it]


Raw model output: Code: tech
Reason: The text primarily discusses the evolution of malware, viruses, and criminal use of technology in a way that is relevant to the field of technology advancements, innovation, science, or tech companies. The language used, such as "hi-tech thieves", "malicious programs", "defences against such attacks", and "computer crime" suggests a focus on the technical aspects of these issues, indicating that this article falls under the category of tech.


 69%|██████▉   | 68/98 [06:33<02:48,  5.63s/it]


Raw model output: Code: entertainment
Reason: The text primarily focuses on a musical event featuring Duran Duran, including interviews, concert footage, and promotions of their album and single. Although there are mentions of the band members' personal opinions and experiences, the central theme is the upcoming VH1 special and related activities, which falls under the category of entertainment.


 70%|███████   | 69/98 [06:41<03:04,  6.38s/it]


Raw model output: Code: tech
Reason: The majority of the text discusses the development and capabilities of next-generation video games consoles, including their processing power, graphical capabilities, convergence technologies, and potential features such as real-world physics, immersive gameplay, and changes in distribution methods. This indicates a strong focus on technology advancements and innovations related to gaming.


 71%|███████▏  | 70/98 [06:46<02:44,  5.87s/it]


Raw model output: Code: business
Reason: The text is primarily about Tate & Lyle, a company, its CEO Iain Ferguson, and its financial performance, including the return to growth and an increase in share value. The article also mentions the company's industry (sugar production) and its ranking on the FTSE 100 index, making it clearly related to business news and economic issues.


 72%|███████▏  | 71/98 [06:50<02:28,  5.50s/it]


Raw model output: Code: politics
Reason: Many tweets, articles and news segments within this BBC news story are related to government policies, international relations, as well as comments made by contestants like Jackie Stallone and John McCririck regarding elections and public voting, which indicates that the overall topic is quite political.


 73%|███████▎  | 72/98 [06:56<02:24,  5.57s/it]


Raw model output: Code: politics
Reason: The vast majority of the text revolves around political topics, including government plans, election strategies, and the roles of key figures like Gordon Brown and Tony Blair. Specifically, the discussion centers on the direction New Labour should be taking for its next general election bid, with Mr. Brown outlining his vision and Mr. Blair responding to questions about his role in the process.


 74%|███████▍  | 73/98 [07:00<02:07,  5.11s/it]


Raw model output: Code: politics
Reason: The article mentions government (specifically, the European Cup and the Commonwealth Games) and discusses policies related to athletic competitions, indicating that the news is related to sports policy or international relations. This aligns with the "politics" code in the provided codebook.


 76%|███████▌  | 74/98 [07:05<02:06,  5.25s/it]


Raw model output: Code: politics
Reason: The text primarily focuses on government policies, specifically housing plans and environmental issues, with quotes from Deputy Prime Minister John Prescott and Chairman of the Environmental Audit Committee Peter Ainsworth MP. The report's recommendations and criticisms also center around the government's actions, making it a clear example of news coverage related to politics.


 77%|███████▋  | 75/98 [07:10<01:57,  5.09s/it]


Raw model output: Code: business
Reason: The text discusses US interest rates increasing to 2%, which is related to economic issues and financial markets, making it a relevant example of business news.


 78%|███████▊  | 76/98 [07:16<01:57,  5.34s/it]


Raw model output: Code: politics
Reason: The majority of the text discusses the debates between Labour and the Tories regarding immigration policies, with quotes from prominent politicians such as Charles Clarke and Liam Fox. The language used is formal and informative, typical of news reporting in the politics category, and the content revolves around political parties' stances on specific issues.


 79%|███████▊  | 77/98 [07:23<02:01,  5.80s/it]


Raw model output: Code: entertainment
Reason: I chose this code because the majority of the text discusses films, movie stars (e.g., Ben Stiller, Robert De Niro, Jim Carrey), and box office rankings, which are all relevant to the entertainment industry. The article specifically mentions film titles like "Meet The Fockers", "White Noise", "The Aviator", and "Lemony Snicket's A Series of Unfortunate Events", as well as movie takings and rankings, indicating that the content is primarily focused on entertainment news.


 80%|███████▉  | 78/98 [07:28<01:52,  5.63s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on government issues, specifically a security breach involving the US Bank's computer tapes containing account details of more than one million customers who are US federal employees. The story involves members of the US Senate, Senators Schumer and Leahy, and mentions the Secret Service investigating the loss, indicating a high level of political relevance and interest.


 81%|████████  | 79/98 [07:32<01:35,  5.00s/it]


Raw model output: Code: politics
Reason: The article discusses a report by a Commons science and technology committee, which is a body of Members of Parliament, indicating that the topic is related to government and its policies, specifically regarding research councils and skills shortages.


 82%|████████▏ | 80/98 [07:36<01:24,  4.70s/it]


Raw model output: Code: politics
Reason: The article primarily discusses Lib Dem leader Charles Kennedy's comments on Tony Blair's trust among the British people, which indicates that the discussion revolves around politics, government, and political leaders.


 83%|████████▎ | 81/98 [07:42<01:29,  5.28s/it]


Raw model output: Code: politics
Reason: The vast majority of the text revolves around the personal and professional dynamics between Tony Blair and Gordon Brown, specifically their alleged rift and rival claims over credit for government initiatives, which are central to understanding the Labour Party's internal struggles.


 84%|████████▎ | 82/98 [07:51<01:39,  6.20s/it]


Raw model output: Code: politics
Reason: The article mentions several politicians or political entities, such as Usher (who is a celebrity but also had a strong connection to the US government, given his performance at the event), Alicia Keys (who performed alongside several high-ranking politicians and royalty), Justin Timberlake (who had connections with high-ranking officials in previous ceremonies), Nelly (who was on stage with Pharrell Williams, and Pharrell has connections to high-ranking officials).


 85%|████████▍ | 83/98 [08:01<01:52,  7.50s/it]


Raw model output: Code: politics
Reason: This article is primarily focused on the economic and fiscal policies announced by Chancellor Gordon Brown in his Budget speech, but it also includes quotes from politicians such as Michael Howard (Tory leader), Charles Kennedy (Liberal Democrat leader), Alex Salmond (SNP leader), Simon Thomas (Plaid Cymru), Roger Knapman (UK Independence Party leader), and Andrew Marr (BBC political editor). The discussion of these statements suggests that the article is attempting to provide a balanced view of the reaction to the Budget, making it clear that this code is used for articles about politics.


 86%|████████▌ | 84/98 [08:08<01:43,  7.39s/it]


Raw model output: Code: tech
Reason: The article discusses various technological advancements and trends, such as digital video recorders (DVR), broadband, video-on-demand, high-definition TV (HDTV), and digital cameras, which indicate a focus on technology-related topics. Additionally, the mention of companies like BT and NTL trialling faster ADSL technology highlights the tech industry's role in shaping the country's digital landscape.


 87%|████████▋ | 85/98 [08:13<01:26,  6.63s/it]


Raw model output: Code: politics
Reason: The majority of the text is focused on a political issue, specifically President Chavez's agrarian reform programme and the government's actions against private property rights, indicating that this news story falls under the category of "politics".


 88%|████████▊ | 86/98 [08:20<01:19,  6.61s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on government-related issues, such as regulatory concerns, foreign investment, and policy implications, which are all characteristic of the politics code category. Specifically, it mentions Chinese regulators' annoyance with MG Rover's public disclosure of talks with Shanghai Automotive Industry Corp, the National Development and Reform Commission's oversight of foreign investment, and the potential for job losses due to the proposed deal. These elements are all rooted in government policy and regulatory affairs, making them a strong fit for the politics code category.


 89%|████████▉ | 87/98 [08:27<01:14,  6.78s/it]


Raw model output: Code: politics
Reason: Although the article primarily focuses on the Bangkok International Film Festival, it mentions the devastating impact of the Asian tsunami disaster and how the festival's organizers have scaled down the event while still continuing with their original plan. This demonstrates a clear connection to international relations and global events, which falls under the category of "politics".


 90%|████████▉ | 88/98 [08:33<01:04,  6.44s/it]


Raw model output: Code: sport
Reason: The text primarily focuses on tennis, mentioning specific players (Andy Roddick and Tommy Haas), tournaments (SAP Open in San Jose), and their respective matches, indicating that the majority of the content is related to sporting events. Additionally, quotes from Roddick and Haas are discussing their strategies and experiences during the match, which further reinforces the coding as sport-related.


 91%|█████████ | 89/98 [08:38<00:53,  5.99s/it]


Raw model output: Code: politics
Reason: The majority of the text revolves around news about Lord Archer's potential re-entry into the Conservative Party, his past convictions, and reactions from party members, such as Dr. Liam Fox, Lord Tebbit, and Sir Teddy Taylor, which indicates a strong focus on political issues and events.


 92%|█████████▏| 90/98 [08:45<00:50,  6.36s/it]


Raw model output: Code: politics
Reason: The news story discusses the rugby match between Ireland and Argentina, but the majority of the article highlights the skill and strategy employed by both teams, with the winning margin being narrow (100% record in autumn internationals). The discussion also touches on the referee's decision to award a penalty to Argentina, which was disputed by the Irish team. However, there is no mention of government policies, elections, or any political figures involved in the match.


 93%|█████████▎| 91/98 [08:48<00:38,  5.54s/it]


Raw model output: Code: politics
Reason: This article is primarily related to government events, as it mentions the broadcasting rights for the Academy Awards ceremony, which implies a connection between Sky and the BBC regarding public interest programming.


 94%|█████████▍| 92/98 [08:54<00:32,  5.50s/it]


Raw model output: Code: sports
Reason: The text primarily focuses on the Davis Cup tennis tournament, with Carlos Moya's emotional victory over Andy Roddick being the main storyline, and also mentions Rafael Nadal's impressive performance in winning the competition at a young age.


 95%|█████████▍| 93/98 [08:57<00:24,  4.93s/it]


Raw model output: Code: sport
Reason: The text primarily focuses on athletic events (pole vaulting and the 60m dash), athletes (Yelena Isinbayeva, Maurice Greene, and Leonard Scott), competitions, and their performances, which all fall under the category of sports.


 96%|█████████▌| 94/98 [09:07<00:24,  6.22s/it]


Raw model output: Code: politics


 97%|█████████▋| 95/98 [09:10<00:16,  5.35s/it]


Raw model output: Code: sports
Reason: The text primarily discusses Stefan Koubek's suspension from tennis due to a failed drugs test and his planned appeal, which is related to an incident involving a sporting event (the French Open) and another sporting competition (the Davis Cup).


 98%|█████████▊| 96/98 [09:16<00:11,  5.55s/it]


Raw model output: Code: politics
Reason: I assigned the code "politics" because the majority of the text discusses government actions and policies related to censorship, internet control, and freedom of information, all of which fall under the scope of politics. The mention of a "net police force", government-approved content restrictions, and the involvement of Beijing in controlling Google News also indicate that the story revolves around political issues and power dynamics at play.


 99%|█████████▉| 97/98 [09:21<00:05,  5.43s/it]


Raw model output: Code: sport
Reason: This annotation is coded as "sport" because the text primarily discusses a specific sporting event, a rugby match between Leeds and Saracens at Headingley Stadium on February 25th, 2000. The details provided about the teams' lineups, player substitutions, and the game's progress indicate that the text is focused on reporting the sports news aspect of the event.


100%|██████████| 98/98 [09:28<00:00,  5.80s/it]


Raw model output: Code: politics
Reason: The text primarily focuses on government policies, international relations, and security measures, specifically the UN Convention on Seafarers' Identity Documents, its revised version, and the implementation of biometric technology for tracking seafarers. Additionally, the quote from Cynthia Musselman mentions "increasing security on the seas as well as border control and protection," which further emphasizes the political context of the story.

Processing complete!
Successfully processed: 98 rows
Errors: 0 rows
Ready to process! Load your DataFrame and call process_dataframe()





In [6]:
df_results

Unnamed: 0,text_id,text,original_code,replicated_code,model_code,reason,model_prediction,reasoning
0,0,Bath faced with Tindall ultimatum\n\nMike Tind...,sport,sport,sport,"This news story is about a rugby player, Mike ...",politics,The text discusses a specific player (Mike Tin...
1,1,Big guns ease through in San Jose\n\nTop-seede...,sport,sport,sport,"The text is reporting on a tennis tournament, ...",sports,"The text primarily focuses on tennis events, i..."
2,2,Chinese dam firm 'defies Beijing'\n\nThe China...,business,business,politics,This news story is about a Chinese company def...,politics,The text primarily focuses on government actio...
3,3,Wenger offers Mutu hope\n\nArsenal boss Arsene...,sport,sport,sport,This news story is about a football player who...,politics,The text discusses the actions of Chelsea FC i...
4,4,Gerrard happy at Anfield\n\nLiverpool captain ...,sport,sport,sport,The article is about Liverpool captain Steven ...,politics,The article discusses Liverpool captain Steven...
...,...,...,...,...,...,...,...,...
95,95,Who do you think you are?\n\nThe real danger i...,tech,tech,tech,The article discusses the dangers of data thef...,politics,The article discusses government regulation an...
96,96,Koubek suspended after drugs test\n\nStefan Ko...,sport,sport,sport,"The news story is about a tennis player, Stefa...",sports,The text primarily discusses Stefan Koubek's s...
97,97,China 'blocks Google news site'\n\nChina has b...,tech,politics,politics,This news story is about China's censorship of...,politics,"I assigned the code ""politics"" because the maj..."
98,98,Leeds v Saracens (Fri)\n\nHeadingley\n\nFriday...,sport,sport,sport,The text is reporting on a rugby match between...,sport,"This annotation is coded as ""sport"" because th..."


In [8]:
results = df_results

In [11]:
def calculate_similarity_metrics(results_df, ground_truth_col, prediction_col):
    """
    Calculate various similarity metrics between two columns
    """
    # Remove rows where either column has missing values
    valid_rows = results_df[[ground_truth_col, prediction_col]].dropna()
    
    if len(valid_rows) == 0:
        return {
            'accuracy': 0,
            'kappa': 0,
            'matching_cases': 0,
            'total_cases': 0,
            'percentage_match': 0
        }
    
    # Calculate metrics
    accuracy = accuracy_score(valid_rows[ground_truth_col], valid_rows[prediction_col])
    kappa = cohen_kappa_score(valid_rows[ground_truth_col], valid_rows[prediction_col])
    matching_cases = (valid_rows[ground_truth_col] == valid_rows[prediction_col]).sum()
    total_cases = len(valid_rows)
    percentage_match = (matching_cases / total_cases) * 100
    
    return {
        'accuracy': accuracy,
        'kappa': kappa,
        'matching_cases': matching_cases,
        'total_cases': total_cases,
        'percentage_match': percentage_match
    }

def analyze_code_similarities(results_df):
    """
    Analyze similarities between model outputs and human categorizations
    """
    # Comparisons to make
    comparisons = [
        ('original_code', 'model_code'),
        ('original_code', 'model_prediction'),
        ('replicated_code', 'model_code'),
        ('replicated_code', 'model_prediction')
    ]
    
    # Calculate metrics for each comparison
    results_dict = {}
    for truth_col, pred_col in comparisons:
        results_dict[f"{truth_col}_vs_{pred_col}"] = calculate_similarity_metrics(
            results_df, truth_col, pred_col
        )
    
    # Create a summary dataframe
    summary_df = pd.DataFrame({
        'Comparison': [k.replace('_', ' ').title() for k in results_dict.keys()],
        'Accuracy': [v['accuracy'] for v in results_dict.values()],
        'Kappa Score': [v['kappa'] for v in results_dict.values()],
        'Matching Cases': [v['matching_cases'] for v in results_dict.values()],
        'Total Cases': [v['total_cases'] for v in results_dict.values()],
        'Match Percentage': [v['percentage_match'] for v in results_dict.values()]
    })
    
    # Format the percentage column
    summary_df['Match Percentage'] = summary_df['Match Percentage'].round(2).astype(str) + '%'
    
    # Create detailed confusion matrices
    confusion_matrices = {}
    for truth_col, pred_col in comparisons:
        valid_rows = results_df[[truth_col, pred_col]].dropna()
        if len(valid_rows) > 0:
            matrix = confusion_matrix(
                valid_rows[truth_col], 
                valid_rows[pred_col],
                labels=sorted(results_df[truth_col].unique())
            )
            confusion_matrices[f"{truth_col}_vs_{pred_col}"] = matrix
    
    return summary_df, confusion_matrices

# Run the analysis with the results dataframe
summary_df, confusion_matrices = analyze_code_similarities(results)

# Display the summary
print("\nSimilarity Analysis Summary:")
print("=" * 100)
print(summary_df.to_string(index=False))
print("\nDetailed Analysis:")
print("=" * 100)

# Determine which model is more similar to human coding
original_code_comparison = summary_df[summary_df['Comparison'].str.contains('Original Code')]
replicated_code_comparison = summary_df[summary_df['Comparison'].str.contains('Replicated Code')]

print("\nComparison with Original Code:")
better_for_original = original_code_comparison.iloc[original_code_comparison['Accuracy'].argmax()]
print(f"Better model: {better_for_original['Comparison']}")
print(f"Accuracy: {better_for_original['Accuracy']:.2%}")
print(f"Kappa Score: {better_for_original['Kappa Score']:.3f}")

print("\nComparison with Replicated Code:")
better_for_replicated = replicated_code_comparison.iloc[replicated_code_comparison['Accuracy'].argmax()]
print(f"Better model: {better_for_replicated['Comparison']}")
print(f"Accuracy: {better_for_replicated['Accuracy']:.2%}")
print(f"Kappa Score: {better_for_replicated['Kappa Score']:.3f}")

# Print confusion matrices with labels
for name, matrix in confusion_matrices.items():
    print(f"\nConfusion Matrix for {name}:")
    categories = sorted(results[name.split('_vs_')[0]].unique())
    print("\nCategories:", categories)
    print(matrix)


Similarity Analysis Summary:
                         Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases Match Percentage
        Original Code Vs Model Code  0.877551     0.842823              86           98           87.76%
  Original Code Vs Model Prediction  0.489796     0.358135              48           98           48.98%
      Replicated Code Vs Model Code  0.816327     0.761557              80           98           81.63%
Replicated Code Vs Model Prediction  0.489796     0.337659              48           98           48.98%

Detailed Analysis:

Comparison with Original Code:
Better model: Original Code Vs Model Code
Accuracy: 87.76%
Kappa Score: 0.843

Comparison with Replicated Code:
Better model: Replicated Code Vs Model Code
Accuracy: 81.63%
Kappa Score: 0.762

Confusion Matrix for original_code_vs_model_code:

Categories: ['business', 'entertainment', 'politics', 'sport', 'tech']
[[15  0  5  0  0]
 [ 2 14  1  0  0]
 [ 0  0 25  0  0]
 [ 0  0  2 23  0]
 [ 1  

In [13]:
import pandas as pd
import ollama
import re
import time
from tqdm import tqdm

# Example codebook
CODEBOOK = """
business: Tweets or news articles related to companies, financial markets, economic issues, or industry trends.
entertainment: Tweets or news articles related to movies, music, celebrities, cultural events, or leisure activities.
politics: Tweets or news articles related to government, elections, political parties, policies, or international relations.
sport: Tweets or news articles related to sporting events, athletes, competitions, or athletic activities.
tech: Tweets or news articles related to technology advancements, innovation, science, or tech companies.
"""

# Few-shot examples to help guide the model
EXAMPLES = """
Example 1:
Text: "Manchester United secure dramatic win over Liverpool with last-minute goal from Rashford"
Code: sport
Reason: This is clearly a sports article about a football match between Manchester United and Liverpool, featuring a specific player and match outcome.

Example 2:
Text: "Apple unveils new iPhone with revolutionary AI capabilities"
Code: tech
Reason: The article is about a technology product launch and discusses technological innovations.

Example 3:
Text: "Global stock markets tumble amid inflation fears"
Code: business
Reason: This covers financial markets and economic issues, specifically focusing on stock market performance.

Example 4:
Text: "New legislation passed on climate change policy"
Code: politics
Reason: The article discusses government policy and legislation, which falls under political coverage.
"""

def prepare_prompt(text):
    system_prompt = f"""Instructions: You are an expert news classifier. Your task is to categorize BBC news articles into ONE of the following categories: business, entertainment, politics, sport, or tech.

Rules:
1. ALWAYS choose exactly ONE category
2. Base your decision on the main focus of the article
3. Format your response exactly as shown in the examples
4. Provide clear reasoning that references specific content from the article

Codebook:
{CODEBOOK}

Examples of correct classifications:
{EXAMPLES}

Now analyze this article:
---
{text}
---

Output your analysis in this exact format:
Code: [category]
Reason: [detailed explanation referencing specific content]"""
    return system_prompt

def classify_text(text, retry_count=3, retry_delay=1):
    """Classify text using local Ollama instance with optimized parameters."""
    for attempt in range(retry_count):
        try:
            # Generate response with optimized parameters
            response = ollama.generate(
                model='llama3.2:latest',
                prompt=prepare_prompt(text),
                stream=False,
                options={
                    'temperature': 0.1,  # Lower temperature for more consistent outputs
                    'top_p': 0.9,
                    'top_k': 40,
                    'num_predict': 200,  # Limit response length
                    'stop': ["\n\n", "Text:", "Example"]  # Stop tokens to prevent rambling
                }
            )
            
            output = response['response']
            
            if not output:
                raise ValueError("Empty response from model")
            
            print("\nRaw model output:", output)  # Debug output
                
            # More robust code extraction
            code_patterns = [
                r'Code:\s*(business|entertainment|politics|sport|tech)(?:\s|$)',  # Strict pattern
                r'Code:\s*([a-zA-Z]+)',  # Backup pattern
                r'\b(business|entertainment|politics|sport|tech)\b'  # Last resort
            ]
            
            code = None
            for pattern in code_patterns:
                match = re.search(pattern, output, re.IGNORECASE)
                if match:
                    code = match.group(1).lower().strip()
                    # Validate against allowed categories
                    if code in ['business', 'entertainment', 'politics', 'sport', 'tech']:
                        break
            
            # If no valid code found, use keyword matching
            if not code:
                text_lower = text.lower()
                keywords = {
                    'sport': ['football', 'soccer', 'tennis', 'rugby', 'player', 'match', 'game', 'tournament', 'championship'],
                    'business': ['market', 'economy', 'company', 'stock', 'trade', 'business', 'financial'],
                    'tech': ['technology', 'software', 'computer', 'digital', 'online', 'internet', 'app'],
                    'politics': ['government', 'minister', 'president', 'election', 'policy', 'parliament'],
                    'entertainment': ['movie', 'film', 'music', 'actor', 'star', 'celebrity', 'show']
                }
                
                # Count keyword matches for each category
                scores = {category: sum(1 for word in words if word in text_lower)
                         for category, words in keywords.items()}
                
                # Use category with most keyword matches if any found
                if max(scores.values()) > 0:
                    code = max(scores.items(), key=lambda x: x[1])[0]
                else:
                    code = "Unknown"
            
            # Extract reasoning
            reasoning_pattern = r'Reason:\s*(.*?)(?=(?:Code:|$))'
            reasoning_match = re.search(reasoning_pattern, output, re.IGNORECASE | re.DOTALL)
            reasoning = reasoning_match.group(1).strip() if reasoning_match else output.strip()
            
            # Validate and clean reasoning
            if len(reasoning) < 10:  # If reasoning too short, use raw output
                reasoning = output.strip()
            
            return code, reasoning
                
        except Exception as e:
            if attempt == retry_count - 1:
                return "Error", f"Processing failed after {retry_count} attempts: {str(e)}"
            time.sleep(retry_delay * (attempt + 1))
            continue

def process_dataframe(df, batch_size=50, save_interval=200):
    """Process the dataframe with progress tracking and periodic saving."""
    df_processed = df.copy()
    
    # Initialize columns for results
    if 'model_prediction' not in df_processed.columns:
        df_processed['model_prediction'] = None
    if 'reasoning' not in df_processed.columns:
        df_processed['reasoning'] = None
    
    unprocessed_mask = df_processed['model_prediction'].isna()
    total_unprocessed = unprocessed_mask.sum()
    
    if total_unprocessed == 0:
        print("All rows have already been processed!")
        return df_processed
    
    print(f"Processing {total_unprocessed} unprocessed rows...")
    
    with tqdm(total=total_unprocessed) as pbar:
        processed_count = 0
        
        for idx in df_processed[unprocessed_mask].index:
            try:
                text = df_processed.loc[idx, 'text']
                code, reasoning = classify_text(text)
                
                df_processed.loc[idx, 'model_prediction'] = code
                df_processed.loc[idx, 'reasoning'] = reasoning
                
                processed_count += 1
                pbar.update(1)
                
                # Save intermediate results
                if processed_count % save_interval == 0:
                    print(f"\nSaving intermediate results after processing {processed_count} rows...")
                    df_processed.to_csv('ollama_intermediate_results.csv', index=False)
                
                # Add delay to prevent overwhelming the model
                if processed_count % batch_size == 0:
                    time.sleep(1)
                    
            except Exception as e:
                print(f"\nError processing row {idx}: {str(e)}")
                continue
    
    # Print summary
    success_count = df_processed['model_prediction'].notna().sum() - df_processed['model_prediction'].eq('Error').sum()
    error_count = df_processed['model_prediction'].eq('Error').sum()
    print(f"\nProcessing complete!")
    print(f"Successfully processed: {success_count} rows")
    print(f"Errors: {error_count} rows")
    
    return df_processed

# Example usage:
if __name__ == "__main__":
    df_results2 = process_dataframe(df)
    print("Ready to process! Load your DataFrame and call process_dataframe()")

Processing 98 unprocessed rows...


  1%|          | 1/98 [00:10<16:31, 10.22s/it]


Raw model output: Code: politics


  2%|▏         | 2/98 [00:14<11:03,  6.91s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on tennis matches, featuring the progress of top-seeded players Andy Roddick and Andre Agassi through the SAP Open tournament. The text describes match outcomes, player performances, and quotes from the athletes themselves, which are all characteristic of sports reporting.


  3%|▎         | 3/98 [00:19<09:40,  6.11s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically the Chinese government's order for the China Three Gorges Project Corp to stop construction of one of its dams due to environmental concerns. It also mentions the company's potential fine and denial of violating regulations, indicating a regulatory issue that falls under political coverage.


  4%|▍         | 4/98 [00:24<08:46,  5.60s/it]


Raw model output: Code: politics
Reason: The article discusses the actions of Chelsea FC, specifically their decision to sack striker Adrian Mutu due to a positive drug test result. This is a matter of government policy and legislation in the context of professional sports, as it relates to the player's rights and the club's responsibilities towards its players.


  5%|▌         | 5/98 [00:31<09:11,  5.93s/it]


Raw model output: Code: politics
Reason: The article discusses Steven Gerrard's contract situation and his desire to stay at Liverpool, as well as the club's plans for the future. It also mentions Rafael Benitez's comments on Gerrard's commitment to the team and the potential signing of Fernando Morientes in the January transfer window. These topics are all related to the governance and management of a sports organization (Liverpool FC), which falls under political coverage.


  6%|▌         | 6/98 [00:36<08:51,  5.78s/it]


Raw model output: Code: sport
Reason: This article primarily focuses on athletic events and performances, specifically the indoor 400m and 3000m track and field competitions, featuring notable athletes such as LaShawn Merritt, Bershawn Jackson, Bernard Lagat, and Asafa Powell. The article also mentions Olympic medalists and world record holders, further emphasizing its sports-oriented content.


  7%|▋         | 7/98 [00:43<09:05,  5.99s/it]


Raw model output: Code: sport
Reason: This article primarily focuses on a sports event (Wales vs. England rugby match) and features quotes from the players, coaches, and team dynamics, indicating that it is a sports news article. The language used, such as "match-winning kick," "touch and go," and "spotlight thrust upon him," further reinforces its sporting nature.


  8%|▊         | 8/98 [00:50<09:25,  6.28s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on the debate and bill related to euthanasia laws, specifically discussing government policy, legislation, and the views of various stakeholders such as charities, Christian groups, and politicians. The article also mentions a petition, parliamentary debates, and the involvement of constitutional affairs minister David Lammy, which further solidifies its classification under politics.


  9%|▉         | 9/98 [00:54<08:27,  5.70s/it]


Raw model output: Code: politics
Reason: The article discusses a complaint from a Muslim group to a broadcasting watchdog, Ofcom, regarding the portrayal of Muslims in a TV drama. This falls under political coverage as it involves government regulation and accountability, specifically with regards to broadcasting guidelines and the responsibility of major UK broadcasters to challenge negative stereotypes.


 10%|█         | 10/98 [00:59<07:57,  5.42s/it]


Raw model output: Code: entertainment
Reason: The article is about a TV presenter, Cat Deeley, resigning from her hosting role on the children's music show CD:UK after six years. It discusses her decision to leave and express gratitude to the team and viewers, as well as her future plans for new television projects. This is clearly an entertainment news article focused on celebrity news and television personalities.


 11%|█         | 11/98 [01:05<08:02,  5.54s/it]


Raw model output: Code: politics
Reason: The article discusses a political figure (Ex-Labour MP George Galloway) making an appeal for the release of a hostage (aid worker Margaret Hassan), which falls under government policy and international relations, characteristic of political coverage. Additionally, the article mentions Mr. Galloway's expulsion from the Labour party due to his opposition to the war on Iraq, further solidifying its classification as a politics-related news article.


 12%|█▏        | 12/98 [01:13<09:12,  6.42s/it]


Raw model output: Code: business
Reason: The article discusses companies' lack of preparedness for HIV/AIDS, citing a report by the World Economic Forum, Harvard, and the UN AIDS agency. It highlights the low level of action taken by businesses to address the disease, with only 71% having no policies in place, and notes that some companies, like Anglo-American, have implemented successful strategies to cope with the issue. The article also mentions the economic benefits of addressing HIV/AIDS, such as increased productivity and profitability, which further supports its classification under business news.


 13%|█▎        | 13/98 [01:21<09:52,  6.97s/it]


Raw model output: Code: tech
Reason: This article discusses the evolution of search engines, their role in navigating the internet, and the impact of technology advancements on web commerce. It mentions companies like Google, Microsoft, Yahoo, and Amazon, which are all tech giants, and highlights the importance of user experience, personalization, and targeted advertising in the search industry. The article also touches on the concept of "information overload" and how search engines are trying to improve their performance by getting to know users better.


 14%|█▍        | 14/98 [01:29<09:53,  7.07s/it]


Raw model output: Code: politics
Reason: Although the article primarily focuses on Arthur Hailey's life and career as an author, it mentions his emigration to Canada and his work experience with the Royal Air Force during World War II. These details hint at his involvement in government or military affairs, which is a key aspect of political coverage.


 15%|█▌        | 15/98 [01:33<08:42,  6.30s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a sporting event, specifically a cross-country race, and features information about the competitors, their performances, and quotes from one of the athletes. The language used, such as "romped to a five-second victory" and "clocked a time of 22 minutes 45 seconds," also suggests a sports-focused article.


 16%|█▋        | 16/98 [01:39<08:29,  6.21s/it]


Raw model output: Code: tech
Reason: The article discusses the development of software that translates colors into musical notes, a technological innovation that falls under the category of tech. Specifically, it highlights the use of computer graphics and programming to create this software, which is a key aspect of technology advancements. Additionally, the article mentions the application of AI capabilities in this software, further solidifying its classification as a tech-related topic.


 17%|█▋        | 17/98 [01:45<08:16,  6.13s/it]


Raw model output: Code: business
Reason: The article discusses financial corruption and its impact on the government's finances, specifically mentioning a loss of 1bn CFA francs ($2m; £1m) per month due to widespread corruption in the finance ministry. It also mentions an investigation into payroll at the ministry and the prime minister's office stating that staff received "unearned salaries", which are related to economic issues and financial markets.


 18%|█▊        | 18/98 [01:50<07:50,  5.88s/it]


Raw model output: Code: politics
Reason: This article discusses the decision-making process of Wales coach Mike Ruddock regarding player releases, specifically focusing on the management of international stars and their availability for regional Celtic League fixtures. The article also mentions the WRU charter and the potential impact on younger players, which suggests a discussion about policy and governance within the rugby union organization.


 19%|█▉        | 19/98 [01:56<07:30,  5.71s/it]


Raw model output: Code: politics
Reason: This article focuses on a political speech by Charles Kennedy, discussing his party's stance on ethnic minority voters, the Iraq war, terrorism, and government plans to hold terror suspects under house arrest. The article also mentions talks between Kennedy and Tony Blair, as well as opposition from Conservative and Liberal Democrat parties in the Lords, which are all key aspects of political coverage.


 20%|██        | 20/98 [02:01<07:09,  5.51s/it]


Raw model output: Code: business
Reason: The article discusses job cuts and their impact on public sector reforms, specifically focusing on the financial implications of reducing civil service jobs. The TUC report highlights that the planned 71,000 job cuts would deliver less than 6% of the £22bn ministers hope to save through efficiency reforms, indicating a business-like analysis of cost savings.


 21%|██▏       | 21/98 [02:07<07:19,  5.70s/it]


Raw model output: Code: politics
Reason: The article discusses government policy, international relations, and political disputes between Pakistan and the UK, as well as the war on terror, Afghanistan, Iraq, and Kashmir. Specifically, it quotes President Musharraf's views on women's rights, terrorism, and extremism, and reports on his meetings with Prime Minister Tony Blair to discuss these issues.


 22%|██▏       | 22/98 [02:14<07:34,  5.98s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a rugby match between Ireland and the USA, featuring team line-ups, player debuts, and coach quotes. The main focus is on the sporting event itself, with details about the teams' preparations and strategies for the game. While there are mentions of international rankings and previous matches, these are secondary to the core topic of the upcoming rugby match.


 23%|██▎       | 23/98 [02:19<07:08,  5.72s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the introduction of a national minimum payment for foster carers, which falls under political coverage. The mention of Minister Margaret Hodge, the Children Bill, and the government's plans to amend the bill further solidify this classification.


 24%|██▍       | 24/98 [02:27<07:57,  6.45s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on the response of Tory leader Michael Howard to criticisms about his views on asylum and immigration, as well as his personal image. It includes quotes from various individuals, including Anne Robinson, Alastair Campbell, and Ken Clarke, who discuss Howard's leadership style and perceived shortcomings. The article also mentions a BBC documentary called "No More Mr Nasty" that provides behind-the-scenes access to Howard, further solidifying the political nature of the content.


 26%|██▌       | 25/98 [02:31<07:02,  5.79s/it]


Raw model output: Code: entertainment
Reason: The article focuses on the casting of Harry Connick Jr in a Broadway revival of the musical "The Pajama Game", discussing his role, the show's production team, and its upcoming opening date. This is clearly an entertainment news article about a cultural event, specifically a theatrical performance.


 27%|██▋       | 26/98 [02:35<06:26,  5.36s/it]


Raw model output: Code: business
Reason: The article discusses UK house prices, specifically the annual rate of growth and changes in average house prices, which falls under financial markets and economic issues. It also mentions government figures from the Office of the Deputy Prime Minister and Land Registry data, indicating a focus on industry trends and market performance.


 28%|██▊       | 27/98 [02:41<06:21,  5.37s/it]


Raw model output: Code: politics
Reason: This article is primarily focused on government policy and legislation, specifically an investigation into whether Italian airline Alitalia is receiving illegal state aid from the European Commission. The article discusses the actions of Transport Commissioner Jacques Barrot, the concerns raised by rival airlines, and the EU's rules regarding state aid, all of which are key aspects of political coverage.


 29%|██▊       | 28/98 [02:47<06:35,  5.65s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial markets, economic issues, and industry trends, specifically discussing a potential bidding war for MCI between Qwest and Verizon. It covers topics such as the value of the deal ($6.75bn), regulatory scrutiny, and the financial implications for both companies, including their debt levels and past settlements with the Securities and Exchange Commission.


 30%|██▉       | 29/98 [02:52<06:13,  5.42s/it]


Raw model output: Code: business
Reason: The article primarily focuses on economic data, specifically industrial production growth and its implications for the US economy. It discusses forecasts, analyst opinions, and expert insights on the state of the economy, job growth, and inflation, which are all key aspects of business news.


 31%|███       | 30/98 [02:58<06:16,  5.53s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on a controversy surrounding Labour's campaign posters and their depiction of Michael Howard. The quotes from Alan Milburn and Julian Lewis, as well as the context of the election and the party's strategy, all point to a political issue that is being debated in the Commons.


 32%|███▏      | 31/98 [03:06<07:06,  6.36s/it]


Raw model output: Code: business
Reason: The article primarily focuses on corporate leadership changes and power struggles within EADS, a European defence and aerospace group. Philippe Camus's departure as French co-head is discussed, along with the appointment of Noel Forgeard as his replacement. Additionally, the article mentions the impact of infighting between Camus and Forgeard on the company's stability and the reactions of key stakeholders, including Herve Gaymard, the French finance minister. The article also touches on the A380 superjumbo project, which is a significant business venture for EADS, highlighting budget concerns and disagreements between Camus and Forgeard.


 33%|███▎      | 32/98 [03:11<06:25,  5.83s/it]


Raw model output: Code: business
Reason: This article focuses on the British Phonographic Industry (BPI) launching a campaign to help independent labels get their music online and benefit from download sales. The BPI is lobbying music service providers, such as iTunes and Napster, to promote independent releases, which indicates a commercial issue related to financial markets and economic issues in the music industry.


 34%|███▎      | 33/98 [03:19<07:02,  6.51s/it]


Raw model output: Code: politics
Reason: This article primarily focuses on political developments, including the re-election of George Bush as US President, reactions from world leaders such as Tony Blair and Michael Howard, and discussions about global issues like terrorism, poverty, and climate change. The article also mentions the upcoming challenges for Mr. Bush's administration, including rebuilding domestic purpose and addressing international relations. While there are some references to specific individuals and events, the overall tone and content of the article suggest a strong focus on political news and analysis.


 35%|███▍      | 34/98 [03:24<06:35,  6.19s/it]


Raw model output: Code: politics
Reason: The article discusses government policy, specifically the reform of the House of Lords, and features quotes from prominent figures such as Baroness Boothroyd and Lord Falconer, who are both involved in the political process. The article also mentions the current role of the lord chancellor as Speaker of the House of Lords, which is a key aspect of the UK's parliamentary system.


 36%|███▌      | 35/98 [03:31<06:39,  6.34s/it]


Raw model output: Code: entertainment
Reason: This article is primarily about music, specifically the top British rock albums of all time as voted by Kerrang! magazine readers. It features interviews with band members, including Ozzy Osbourne, and discusses their discography, which falls under entertainment coverage. The article also includes quotes from the editor and a brief history of the band's formation and success, further solidifying its classification as an entertainment news article.


 37%|███▋      | 36/98 [03:37<06:37,  6.41s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a sports event, specifically the Spar European Indoor Championships in Madrid, and features a runner, James McIlroy, who is competing for his first major title. The article discusses McIlroy's form, his coach Tony Lester, and his ambitions for the upcoming competition, which are all relevant to the world of athletics.


 38%|███▊      | 37/98 [03:42<05:58,  5.88s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the upcoming election, voter turnout, and key issues that older voters want the government to address, such as pensions and the NHS. It also quotes a charity boss urging political parties to take action to win over the "grey vote".


 39%|███▉      | 38/98 [03:50<06:22,  6.37s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the ban on hunting with dogs coming into force. It mentions quotes from politicians such as Theresa May, Simon Hart, and Alastair McWhirter, who share their views on the law and its enforcement. Additionally, the article reports on police actions and arrests related to suspected hunting violations, indicating a strong political focus.


 40%|███▉      | 39/98 [03:59<07:04,  7.20s/it]


Raw model output: Code: entertainment
Reason: Although the article discusses music download networks and advertisers, its main focus is on the controversy surrounding peer-to-peer ads and their impact on the music industry. The article quotes Paul Myers, chief executive of Wippit, and Mark Mulligan, a music analyst with Jupiter Research, discussing the legitimacy of file-sharing networks and the role of advertisers in supporting them. While music is mentioned, the article's primary concern is not entertainment news but rather the business and legal implications of peer-to-peer ads on the music industry.


 41%|████      | 40/98 [04:04<06:21,  6.58s/it]


Raw model output: Code: sport
Reason: This article primarily focuses on a tennis match between Sania Mirza and Svetlana Kuznetsova, discussing the outcome of the match, player performances, and quotes from the players. The article also mentions other tennis matches and players, further emphasizing its sports-oriented content.


 42%|████▏     | 41/98 [04:18<08:30,  8.95s/it]


Raw model output: Code: Misuse of the "Urban" Term
Reason: The term "urban" has been misused to describe black music, implying that it is exclusive to black artists or genres. This label is misleading and ignores the fact that many white bands also create urban music. For example, Oasis, a white rock band from Manchester, characterizes the urban life they know, yet they are not described as "urban" due to their racial background.


 43%|████▎     | 42/98 [04:27<08:10,  8.76s/it]


Raw model output: Code: business
Reason: The article discusses financial markets and economic issues, specifically focusing on the Chancellor's golden rule, borrowing, and taxation. It mentions a £2.1bn boost to the current budget measure, an annual "black hole" of £10.5bn in the nation's coffers, and economists' predictions about fiscal consolidation. The article also quotes the Shadow Chancellor and the Treasury, indicating a focus on economic policy and government finance.


 44%|████▍     | 43/98 [04:32<07:02,  7.68s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on football (soccer) news, specifically discussing the FIFA World Player of the Year award and its shortlisted candidates. The mention of Arsenal's Thierry Henry, Barcelona's Ronaldinho, AC Milan's Andriy Shevchenko, Mia Hamm, Birgit Prinz, and Marta also reinforces this classification, as it highlights football players and teams.


 45%|████▍     | 44/98 [04:36<06:03,  6.73s/it]


Raw model output: Code: politics
Reason: The article discusses the future of Aston Villa manager David O'Leary and his contract negotiations, which involves government-like decisions on staff retention and backroom deals. This falls under political coverage, as it touches on the management of a public institution (a football club) and the implications for its employees and stakeholders.


 46%|████▌     | 45/98 [04:41<05:17,  5.99s/it]


Raw model output: Code: business
Reason: The article discusses Microsoft's launch of MSN Spaces, a blogging service, and its implications for the company's expansion into online publishing. It also mentions competitors like Google and AOL, which offer similar services, indicating an interest in the market trends and competition in the industry.


 47%|████▋     | 46/98 [04:47<05:18,  6.13s/it]


Raw model output: Code: tech
Reason: The article discusses technology advancements, specifically a new screensaver tool developed by Lycos that endlessly requests data from spam websites, aiming to make it more expensive for spammers to operate. This is a clear example of tech-related news, focusing on innovation and its potential impact on the online world.


 48%|████▊     | 47/98 [04:53<05:05,  6.00s/it]


Raw model output: Code: politics
Reason: The article discusses the US government's attempt to overturn a court ruling that threw out its claim for $280bn in damages from tobacco firms, which falls under political coverage. Specifically, it mentions the Justice Department's appeal and the use of legislation to fight organized crime, indicating a focus on government policy and action.


 49%|████▉     | 48/98 [05:02<05:43,  6.87s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the National Audit Office's report on improving adult literacy skills, the target of 1.5 million adults gaining basic qualifications by 2010, and the government's plans to spend £3.7bn on implementing a programme to address this issue. The article also mentions the Prime Minister Tony Blair's statement that it is "only the start of the journey" and quotes various politicians, including Education minister Ivan Lewis and Shadow Education Secretary Tim Collins, which further emphasizes the political nature of the topic.


 50%|█████     | 49/98 [05:10<05:59,  7.34s/it]


Raw model output: Code: business
Reason: The article primarily focuses on the financial performance of General Motors (GM) and Ford, discussing their production cuts due to falling car sales. It also mentions market trends, such as foreign rivals gaining share in the US market, and quotes from executives explaining the reasons behind these changes. Specifically, the article states that GM and Ford blamed high fuel prices for low sales of big trucks and SUVs, which are profitable vehicles. The article also provides data on sales figures, production cuts, and market shares, all of which are relevant to business news.


 51%|█████     | 50/98 [05:16<05:28,  6.85s/it]


Raw model output: Code: business
Reason: The article discusses a criminal case involving the distribution of pirated DVDs of Bollywood films, specifically focusing on financial gains (£26,000 per month) and the impact on legitimate businesses. It also mentions the involvement of industry organizations (British Phonographic Industry) and their efforts to combat piracy, indicating a commercial or economic issue at play.


 52%|█████▏    | 51/98 [05:24<05:37,  7.18s/it]


Raw model output: Code: politics
Reason: The article discusses government policy, specifically the expansion of anti-social behaviour orders (Asbos) to 50 new pilot areas, the use of special prosecutors and local experts, and the announcement of new measures to strengthen Asbos and fixed penalty notices. It also mentions the Prime Minister's speech and the Home Office Minister's comments on the effectiveness of CSOs, indicating a focus on government initiatives and policies related to law enforcement and community safety.


 53%|█████▎    | 52/98 [05:29<04:57,  6.47s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial performance and industry trends, specifically discussing UK Coal's losses, share price fluctuations, and operational changes. It also mentions the company's efforts to improve its business through new wage structures, maintenance regimes, and cost-cutting measures, which are all relevant to economic issues and industry developments.


 54%|█████▍    | 53/98 [05:33<04:20,  5.78s/it]


Raw model output: Code: politics
Reason: The article discusses a potential merger between two trade unions, which would require approval from their executives and membership. It also mentions the involvement of Labour's leadership and the Warwick Agreement, indicating that the union is trying to exert more influence with ministers and employers, which falls under political coverage.


 55%|█████▌    | 54/98 [05:39<04:25,  6.04s/it]


Raw model output: Code: business
Reason: The article primarily focuses on Boeing's new aircraft launch and its impact on the airline industry, discussing financial performance, market trends, and competition with Airbus. Specific details such as sales projections, orders from airlines, and profit margins are also mentioned, which further solidify the classification under the business category.


 56%|█████▌    | 55/98 [05:46<04:26,  6.20s/it]


Raw model output: Code: politics
Reason: This article primarily focuses on government policy and international relations, specifically discussing the decision by India and Pakistan to open a bus link across the ceasefire line dividing Kashmir. The article also mentions Foreign Secretary Jack Straw's praise for the "spirit of cooperation" and his hopes that the agreement will make a difference to Kashmiris. Additionally, it mentions talks between the Indian government and Pakistan on reducing the risk of nuclear accidents, which further emphasizes the political nature of the article.


 57%|█████▋    | 56/98 [05:50<03:58,  5.67s/it]


Raw model output: Code: business
Reason: The article discusses Japan's economy and its growth figures, specifically focusing on the country's technical recession, economic recovery, and the impact of a strengthening yen on exports. It also mentions government responses and expert opinions from an economist at Lehman Brothers, which further solidifies its classification as a business news article.


 58%|█████▊    | 57/98 [05:56<03:52,  5.68s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the resignation of David Blunkett, Tony Blair's comments about his future, and the reactions from Tory Dominic Grieve and Labour backbencher Martin O'Neill. It also mentions the prime minister's "praetorian guard" reference, which is a term typically used in political contexts to describe an elite group of advisors or bodyguards.


 59%|█████▉    | 58/98 [06:04<04:11,  6.29s/it]


Raw model output: Code: sport
Reason: This article primarily focuses on rugby, specifically the Six Nations tournament and England's performance. The quotes from David Campese, a former Australian wing, discuss the team's behavior, leadership issues, and potential complaints to the International Rugby Board. While there are mentions of other teams (Ireland, Wales, Scotland, France) and coaches (Andy Robinson), the core topic is rugby and its related controversies, making it a sports article.


 60%|██████    | 59/98 [06:10<04:01,  6.19s/it]


Raw model output: Code: sport
Reason: This article is about Martina Hingis's return to competitive tennis after a two-year hiatus, specifically focusing on her upcoming match against Marlene Weingartner at the Volvo Women's Open in Pattaya, Thailand. The language used, such as "I always want to win" and "You miss being out there in a Grand Slam final," suggests that the article is primarily about Hingis's athletic performance and competitive spirit, which are key elements of sports reporting.


 61%|██████    | 60/98 [06:16<03:58,  6.28s/it]


Raw model output: Code: politics
Reason: This article discusses government policy and legislation, specifically focusing on the appointment of Alastair Campbell as a media consultant to Sir Clive Woodward's 2005 Lions tour. The article mentions the deterioration in media relations from the previous Lions tour and the need for a strategy to deal with the pressures of the media event. Additionally, it mentions that Campbell will resume working for the government in the new year, indicating a connection to the political sphere.


 62%|██████▏   | 61/98 [06:22<03:44,  6.07s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on a dispute between Newry City FC and the IFA regarding their ejection from the Nationwide Irish Cup. It mentions an appeal pending, a law firm putting forward a case, and the possibility of lodging an injunction in court to stop Bangor playing Portadown, all of which are related to legal implications and government-like actions.


 63%|██████▎   | 62/98 [06:31<04:16,  7.12s/it]


Raw model output: Code: politics
Reason: Although the article discusses various athletes and their performances, its focus on the Irish Athletics Council's sting operation to catch Cathal Lombard for doping, and the subsequent two-year ban, indicates a political aspect. The use of phrases like "the sport was rocked" and "the country attempted to delude itself into believing" also suggests that the article is discussing the impact of this incident on the athletics community, which can be seen as a political issue.


 64%|██████▍   | 63/98 [06:36<03:43,  6.39s/it]


Raw model output: Code: entertainment
Reason: The article discusses a music poll, specifically an album of the year award, featuring various artists such as The Streets, Keane, Franz Ferdinand, and U2. It also includes quotes from Q Magazine praising the albums and their songwriting qualities, indicating that the main focus is on music and cultural events.


 65%|██████▌   | 64/98 [06:42<03:27,  6.09s/it]


Raw model output: Code: entertainment
Reason: This article is primarily about the release of a film, "Howl's Moving Castle", by Oscar-winning animator Hayao Miyazaki. It discusses the film's box office performance in Japan and its upcoming global release in 50 countries, as well as its connection to the author Diana Wynne Jones and its reception from audiences. The article focuses on the entertainment industry, specifically movies and cultural events.


 66%|██████▋   | 65/98 [06:46<03:09,  5.75s/it]


Raw model output: Code: entertainment
Reason: This article is about a film award and its winner, Morgan Spurlock, which falls under entertainment coverage. The article also mentions the film "Super Size Me" and its impact on McDonald's, but the primary focus is on the award ceremony and the filmmaker, making it an entertainment news article.


 67%|██████▋   | 66/98 [06:55<03:34,  6.69s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial issues related to US Airways' bankruptcy, its impact on passengers and employees, and the airline's struggles with staffing and pay cuts. Specifically, it mentions that US Airways is in Chapter 11 bankruptcy protection for the second time in two years, battling to cut costs, and negotiating pay cuts with flight and baggage staff. The article also discusses the potential financial implications of the chaos on Delta Air Lines, parent of Comair, and the industry's compliance with a 1999 agreement aimed at improving passenger service quality.


 68%|██████▊   | 67/98 [07:05<03:51,  7.48s/it]


Raw model output: Code: tech
Reason: The article discusses the evolution of malware, its increasing sophistication, and the tactics used by malicious programs. It also mentions the use of technology by criminals to commit cybercrime, such as exploiting loopholes in browsers or hiding in e-mail message attachments. Additionally, the article quotes experts on the growing criminal use of malware and how firms are tightening up networks with defences to combat it. The main motivation for these crimes is money, as stated by Gary Stowell, spokesman for St Bernard software, highlighting the financial aspect of cybercrime.


 69%|██████▉   | 68/98 [07:10<03:22,  6.75s/it]


Raw model output: Code: entertainment
Reason: The article primarily focuses on the music industry, specifically a VH1 special featuring Duran Duran, including interviews, concert footage, and fan interactions. It also mentions the band's recent album release, tour dates, and promotional efforts for their new single, indicating that the main emphasis is on entertainment news related to the music world.


 70%|███████   | 69/98 [07:18<03:29,  7.22s/it]


Raw model output: Code: tech
Reason: The article discusses the development of next-generation video game consoles, their capabilities, and the impact on the gaming industry. It mentions technological advancements such as processing power, graphical capabilities, convergence technologies, and real-world physics, which are all related to technology and innovation in the field of gaming.


 71%|███████▏  | 70/98 [07:23<03:00,  6.44s/it]


Raw model output: Code: business
Reason: The article focuses on Tate & Lyle's CEO, Iain Ferguson, being awarded European Businessman of the Year by Forbes, and discusses his role in returning the company to growth and its subsequent stock price increase. This is a clear example of business news, specifically covering companies, financial markets, and economic issues.


 72%|███████▏  | 71/98 [07:28<02:43,  6.06s/it]


Raw model output: Code: entertainment
Reason: This article focuses on celebrity news, specifically the eviction of Jackie Stallone from Celebrity Big Brother, and discusses her interactions with other contestants, as well as reactions from bookmakers and protesters. The tone is also lighthearted and attention-grabbing, typical of entertainment reporting.


 73%|███████▎  | 72/98 [07:33<02:32,  5.87s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on Gordon Brown outlining his vision for New Labour's next general election bid, discussing key themes such as ensuring every child has the best start in life and promoting lifelong education. This is a clear example of political coverage, specifically related to government policy and leadership within the Labour party.


 74%|███████▍  | 73/98 [07:38<02:21,  5.65s/it]


Raw model output: Code: sport
Reason: This article is primarily about the athletic career of Ashia Hansen, a triple jumper, and discusses her injury setback and potential return to competition. The language used, such as "sidelined" and "competitive arena," further reinforces the sports focus. Additionally, the mention of specific events like the European Cup and the Commonwealth Games adds to the sports context.


 76%|███████▌  | 74/98 [07:46<02:26,  6.10s/it]


Raw model output: Code: politics
Reason: The article primarily discusses government policy and legislation related to housing plans, with quotes from Deputy Prime Minister John Prescott and Chairman of the Environmental Audit Committee Peter Ainsworth MP. The report highlights concerns about environmental damage, energy efficiency, and transport, which are all key aspects of government policy. Additionally, the article mentions specific initiatives and proposals, such as a "national spatial framework" for England, which further reinforces its focus on political coverage.


 77%|███████▋  | 75/98 [07:52<02:25,  6.30s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial markets, economic issues, and industry trends, specifically discussing the US interest rate increase, its impact on the economy, and the Federal Reserve's decision-making process. The article also mentions stock market performance, job creation, exports, and inflation rates, all of which are key aspects of business news.


 78%|███████▊  | 76/98 [07:59<02:22,  6.46s/it]


Raw model output: Code: politics
Reason: The article primarily discusses government policies, specifically the Labour Party's immigration plans and the response from the Conservative Party, led by Tony Blair. It also mentions Home Secretary Charles Clarke and Tory co-chairman Liam Fox, who are both prominent figures in the political landscape. The article focuses on the debate over immigration and asylum policy, which is a key aspect of politics.


 79%|███████▊  | 77/98 [08:05<02:09,  6.15s/it]


Raw model output: Code: entertainment
Reason: The article primarily discusses the performance of various films at the US box office, including their earnings, rankings, and notable achievements (e.g., Jim Carrey's ninth film to hit $100m mark). It also mentions the success of the film "Meet The Fockers" in other countries like Australia and Mexico.


 80%|███████▉  | 78/98 [08:11<02:06,  6.32s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial issues and data security, specifically discussing the loss of customer account details by Bank of America. It mentions the number of affected customers (over 1 million), the potential for identity theft, and the bank's investigation with federal law authorities. While it does mention some political figures, such as Senators Schumer and Leahy, the main focus is on the financial implications and data security concerns, making "business" the most appropriate category.


 81%|████████  | 79/98 [08:16<01:52,  5.91s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically mentioning a national body designed to tackle skills shortages in key subjects, a report by the Commons science and technology committee, and recommendations for a "strategic capabilities fund" to address shortages. This falls under political coverage, as it involves government actions and policies related to education and workforce development.


 82%|████████▏ | 80/98 [08:23<01:50,  6.14s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on the comments made by Charles Kennedy, the leader of the Liberal Democrats, regarding Tony Blair's trustworthiness as prime minister. The discussion revolves around political issues such as tax plans, anti-terror laws, and immigration, which are all central to the Lib Dem party's stance. Additionally, Mr. Kennedy's statements about the party's policies and his own leadership goals further emphasize the article's focus on politics.


 83%|████████▎ | 81/98 [08:31<01:52,  6.62s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on the alleged rift between Tony Blair and Gordon Brown over who should take credit for the government's global aid and debt initiatives. It also mentions their roles in general election planning, cabinet meetings, and public statements, which are all related to political coverage.


 84%|████████▎ | 82/98 [08:39<01:54,  7.14s/it]


Raw model output: Code: entertainment
Reason: The article primarily focuses on music awards, including the winners of various categories such as best group, best video, best song, and best album. It also mentions performances by artists like OutKast, Usher, Muse, and others, which are all related to the entertainment industry. Additionally, the article quotes artists and musicians, further solidifying its classification as an entertainment news article.


 85%|████████▍ | 83/98 [08:49<01:59,  7.95s/it]


Raw model output: Code: politics
Reason: The article primarily focuses on Gordon Brown's Budget speech, discussing various economic policies, tax cuts, and social welfare measures. It includes quotes from political leaders such as Michael Howard, Charles Kennedy, Alex Salmond, Simon Thomas, Roger Knapman, and the Green Party, which further emphasizes its political nature. Additionally, the article mentions specific government initiatives, such as a memorial to the Queen Mother and plans for same-sex couples, which are also related to politics.


 86%|████████▌ | 84/98 [08:57<01:50,  7.86s/it]


Raw model output: Code: business
Reason: The article discusses the UK's adoption of digital technologies, including broadband speeds, digital TV, and gadgets such as mobiles, cameras, and gaming devices. It also mentions companies like NTL and BT, which are cable and telecommunications providers, and their trials with new broadband technology. Additionally, the article quotes a Jupiter analyst discussing trends in technology adoption across Europe, further emphasizing the business aspect of the report.


 87%|████████▋ | 85/98 [09:03<01:38,  7.56s/it]


Raw model output: Code: business
Reason: The article discusses a dispute over land ownership and seizure by Venezuelan authorities, which falls under financial markets and economic issues. Specifically, it mentions the Vestey Group's subsidiary, Agroflora, operating farms in Venezuela, and the government's agrarian reform programme aimed at seizing idle lands. This indicates that the article is focused on business and economic matters related to companies and their operations in Venezuela.


 88%|████████▊ | 86/98 [09:08<01:19,  6.62s/it]


Raw model output: Code: business
Reason: The article primarily focuses on financial markets and economic issues, specifically discussing a proposed tie-up between MG Rover and Chinese carmaker Shanghai Automotive Industry Corp, its potential impact on the workforce at Rover's Longbridge plant, and concerns about job losses.


 89%|████████▉ | 87/98 [09:15<01:13,  6.70s/it]


Raw model output: Code: entertainment
Reason: This article primarily focuses on the Bangkok International Film Festival, including its events, celebrity guests, and film premieres. The festival's program, attendees, and cultural significance are all discussed in detail, indicating that the main focus is on entertainment news related to movies and the film industry.


 90%|████████▉ | 88/98 [09:20<01:03,  6.31s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a tennis match, specifically the final of the SAP Open in San Jose, featuring the top seed Andy Roddick and his opponent Cyril Saulnier. The article discusses the match outcome, player quotes, and injuries sustained by one of the players, which are all characteristic of sports reporting.


 91%|█████████ | 89/98 [09:25<00:53,  5.97s/it]


Raw model output: Code: politics
Reason: The article discusses government policy and legislation, specifically focusing on Lord Archer's potential rejoining of the Conservative Party, with quotes from party co-chairman Dr. Liam Fox and other senior figures, including Lord Tebbit and Sir Teddy Taylor, who weigh in on the issue.


 92%|█████████▏| 90/98 [09:31<00:47,  5.88s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a sports event, specifically an international rugby match between Ireland and Argentina. The text describes the game's progression, player performances, and match outcomes, making it clear that the main focus is on the sporting aspect of the story.


 93%|█████████▎| 91/98 [09:36<00:39,  5.69s/it]


Raw model output: Code: entertainment
Reason: This article is primarily about the broadcasting rights for the Academy Awards (Oscar night), a cultural event that falls under the category of entertainment. The article discusses Sky's new deal to broadcast the event, its coverage plans, and the ceremony's host, Chris Rock, which are all related to the film industry and popular culture.


 94%|█████████▍| 92/98 [09:43<00:36,  6.07s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a sports event, specifically the Davis Cup tennis tournament, with detailed coverage of the match between Carlos Moya and Andy Roddick, as well as the performances of other players like Rafael Nadal. The language used also suggests a focus on athletic achievement and competition, with quotes from players and coaches discussing strategy and tactics.


 95%|█████████▍| 93/98 [09:48<00:28,  5.61s/it]


Raw model output: Code: sport
Reason: This article focuses on athletic achievements, specifically pole vaulting, and features details about a world record attempt by Yelena Isinbayeva, as well as results from the men's 60m event. The tone and language used also suggest a sports news context, with quotes from athletes and discussion of competition outcomes.


 96%|█████████▌| 94/98 [09:58<00:27,  6.89s/it]


Raw model output: Code: business


 97%|█████████▋| 95/98 [10:02<00:18,  6.09s/it]


Raw model output: Code: sport
Reason: This article is primarily about a tennis player, Stefan Koubek, who has been suspended by the International Tennis Federation (ITF) due to testing positive for a banned substance. The article also mentions his involvement in the Davis Cup and the upcoming Australian Open, further solidifying its focus on sports news.


 98%|█████████▊| 96/98 [10:08<00:12,  6.25s/it]


Raw model output: Code: tech
Reason: The article discusses the blocking of a news site, specifically Google News, and its implications on internet access and censorship. It also mentions Google's response to the issue and the technical aspects of how the news service is curated, such as the use of computer algorithms to select headlines. Additionally, it highlights China's extensive control over the internet, including monitoring websites and e-mails, which suggests a focus on technological and internet-related issues.


 99%|█████████▉| 97/98 [10:13<00:05,  5.71s/it]


Raw model output: Code: sport
Reason: This article is primarily focused on a rugby match between Leeds and Saracens, including team selections, player movements, and game details. The language used, such as "Tykes", "prop", "fly-half", and "pack", also suggests a sports context.


100%|██████████| 98/98 [10:19<00:00,  6.32s/it]


Raw model output: Code: tech
Reason: The article discusses biometric technology and its implementation on a luxury cruise liner, specifically mentioning prototype versions of internationally issued biometric ID cards and machine readers produced by different companies. This focus on technological advancements and innovation in the field of biometrics classifies the article as tech news.

Processing complete!
Successfully processed: 98 rows
Errors: 0 rows
Ready to process! Load your DataFrame and call process_dataframe()





In [14]:
results = df_results2

In [15]:
def calculate_similarity_metrics(results_df, ground_truth_col, prediction_col):
    """
    Calculate various similarity metrics between two columns
    """
    # Remove rows where either column has missing values
    valid_rows = results_df[[ground_truth_col, prediction_col]].dropna()
    
    if len(valid_rows) == 0:
        return {
            'accuracy': 0,
            'kappa': 0,
            'matching_cases': 0,
            'total_cases': 0,
            'percentage_match': 0
        }
    
    # Calculate metrics
    accuracy = accuracy_score(valid_rows[ground_truth_col], valid_rows[prediction_col])
    kappa = cohen_kappa_score(valid_rows[ground_truth_col], valid_rows[prediction_col])
    matching_cases = (valid_rows[ground_truth_col] == valid_rows[prediction_col]).sum()
    total_cases = len(valid_rows)
    percentage_match = (matching_cases / total_cases) * 100
    
    return {
        'accuracy': accuracy,
        'kappa': kappa,
        'matching_cases': matching_cases,
        'total_cases': total_cases,
        'percentage_match': percentage_match
    }

def analyze_code_similarities(results_df):
    """
    Analyze similarities between model outputs and human categorizations
    """
    # Comparisons to make
    comparisons = [
        ('original_code', 'model_code'),
        ('original_code', 'model_prediction'),
        ('replicated_code', 'model_code'),
        ('replicated_code', 'model_prediction')
    ]
    
    # Calculate metrics for each comparison
    results_dict = {}
    for truth_col, pred_col in comparisons:
        results_dict[f"{truth_col}_vs_{pred_col}"] = calculate_similarity_metrics(
            results_df, truth_col, pred_col
        )
    
    # Create a summary dataframe
    summary_df = pd.DataFrame({
        'Comparison': [k.replace('_', ' ').title() for k in results_dict.keys()],
        'Accuracy': [v['accuracy'] for v in results_dict.values()],
        'Kappa Score': [v['kappa'] for v in results_dict.values()],
        'Matching Cases': [v['matching_cases'] for v in results_dict.values()],
        'Total Cases': [v['total_cases'] for v in results_dict.values()],
        'Match Percentage': [v['percentage_match'] for v in results_dict.values()]
    })
    
    # Format the percentage column
    summary_df['Match Percentage'] = summary_df['Match Percentage'].round(2).astype(str) + '%'
    
    # Create detailed confusion matrices
    confusion_matrices = {}
    for truth_col, pred_col in comparisons:
        valid_rows = results_df[[truth_col, pred_col]].dropna()
        if len(valid_rows) > 0:
            matrix = confusion_matrix(
                valid_rows[truth_col], 
                valid_rows[pred_col],
                labels=sorted(results_df[truth_col].unique())
            )
            confusion_matrices[f"{truth_col}_vs_{pred_col}"] = matrix
    
    return summary_df, confusion_matrices

# Run the analysis with the results dataframe
summary_df, confusion_matrices = analyze_code_similarities(results)

# Display the summary
print("\nSimilarity Analysis Summary:")
print("=" * 100)
print(summary_df.to_string(index=False))
print("\nDetailed Analysis:")
print("=" * 100)

# Determine which model is more similar to human coding
original_code_comparison = summary_df[summary_df['Comparison'].str.contains('Original Code')]
replicated_code_comparison = summary_df[summary_df['Comparison'].str.contains('Replicated Code')]

print("\nComparison with Original Code:")
better_for_original = original_code_comparison.iloc[original_code_comparison['Accuracy'].argmax()]
print(f"Better model: {better_for_original['Comparison']}")
print(f"Accuracy: {better_for_original['Accuracy']:.2%}")
print(f"Kappa Score: {better_for_original['Kappa Score']:.3f}")

print("\nComparison with Replicated Code:")
better_for_replicated = replicated_code_comparison.iloc[replicated_code_comparison['Accuracy'].argmax()]
print(f"Better model: {better_for_replicated['Comparison']}")
print(f"Accuracy: {better_for_replicated['Accuracy']:.2%}")
print(f"Kappa Score: {better_for_replicated['Kappa Score']:.3f}")

# Print confusion matrices with labels
for name, matrix in confusion_matrices.items():
    print(f"\nConfusion Matrix for {name}:")
    categories = sorted(results[name.split('_vs_')[0]].unique())
    print("\nCategories:", categories)
    print(matrix)


Similarity Analysis Summary:
                         Comparison  Accuracy  Kappa Score  Matching Cases  Total Cases Match Percentage
        Original Code Vs Model Code  0.877551     0.842823              86           98           87.76%
  Original Code Vs Model Prediction  0.785714     0.725454              77           98           78.57%
      Replicated Code Vs Model Code  0.816327     0.761557              80           98           81.63%
Replicated Code Vs Model Prediction  0.704082     0.616153              69           98           70.41%

Detailed Analysis:

Comparison with Original Code:
Better model: Original Code Vs Model Code
Accuracy: 87.76%
Kappa Score: 0.843

Comparison with Replicated Code:
Better model: Replicated Code Vs Model Code
Accuracy: 81.63%
Kappa Score: 0.762

Confusion Matrix for original_code_vs_model_code:

Categories: ['business', 'entertainment', 'politics', 'sport', 'tech']
[[15  0  5  0  0]
 [ 2 14  1  0  0]
 [ 0  0 25  0  0]
 [ 0  0  2 23  0]
 [ 1  

In [16]:
df_results.to_csv('results_csvs/bbc_new_small_llama_first.csv', index=False)
df_results2.to_csv('results_csvs/bbc_new_small_llama_final.csv', index=False)