# Code switched language modeling performance metric comparison

## Data set preparation
### Download raw files
We found some twiter corpuses in this repository: https://github.com/divamgupta/mtl_girnet/tree/master/data_prep/data_cm_senti
- code switched English and Spanish: https://github.com/divamgupta/mtl_girnet/blob/master/data_prep/data_cm_senti/cs-corpus-with-tweets_test.txt
- code switched English and Spanish: https://github.com/divamgupta/mtl_girnet/blob/master/data_prep/data_cm_senti/cs-corpus-with-tweets_train.txt
- Spanish: https://github.com/divamgupta/mtl_girnet/blob/master/data_prep/data_cm_senti/1600_tweets_dev_complete.txt
- https://github.com/divamgupta/mtl_girnet/blob/master/data_prep/data_cm_senti/twitter4242.txt

### Preprocess raw corpuses
- remove prefix and trailing information from tweets
- combine the training and test set of the code switched data
- create three new line seperated lists of tweets:
 - codes_switched_spanish_english_tweets.txt
 - spanish_tweets.txt
 - english_tweets.txt

In [1]:
# read in the tweet data sets
from pprint import pprint


def get_tweets(filename)->list:
    tweets = []
    with open(filename, 'r', encoding='UTF-8') as file:
        while (line := file.readline().rstrip()):
            tweets.append(line)
    return tweets


file_path = '../data/preprocessed_corpuses/'
english_tweets = get_tweets(file_path + 'english_tweets.txt')
spanish_tweets = get_tweets(file_path + 'spanish_tweets.txt')
codes_switched_spanish_english_tweets = get_tweets(file_path + 'code_switched_spanish_english_tweets.txt')

pprint(codes_switched_spanish_english_tweets[:10])


['Waiting for my momma to get home y nada :(',
 'Me dos!!! Y fuck a los que no lea guste lol',
 "& then my mom wouldn't like her cause she's not my boo! & My sister would "
 'rat me out too!!! Segun muy Best Cuñadas ellas! Lol',
 'Only child. Hijo único Person 1: Do you have any brothers or sisters? Person '
 '2: No, I’m the only child.',
 '@rvictoriamiguez hay un par de frases q te pueden servir, La unión hace la '
 'fuerza: There is strength in numbers. Or United we stand :)',
 '@princessstacy_ lmao I know  we suck  Mejor que nos Vengan a Ver Ellos So we '
 "know it's real cx",
 '@AlanTorres8484 well duh thats how it should be..si no k aburrido si nomas '
 'te digo hola..lol',
 'Un dia del año k le ablo and hes busy!! Blah fuck it <<<',
 '@crystal_jaimes ba you sounded lk my sister! ! :D its alright no te enseles '
 'aprende k ay k compartir..lmao',
 'I don’t have cash on me: No tengo efectivo en este momento. I’m sorry I '
 'don’t have cash on me, all I have is my card.']


In [2]:
# pip install openai
# conda install -c conda-forge openai
import os
import openai
import time



In [3]:
# set API key by copying and pasting it here, do not commit the key to source control

open_ai_key = '' # sk-some_long_string

openai.api_key = open_ai_key

In [4]:
from collections import namedtuple


Input = namedtuple('Input', ['original_text', 
                              'prompt', 
                              'start',
                              'stop',
                              'actual_next_word'
                              ])

GPT_response = namedtuple('GPT_response', ['gpt3_response',
                              'predicted_next_word',
                              'predicted_actual_match'])

In [5]:
# set parameters

word_position_to_predict = -3 # many of the english tweets end with a url, we want to predict a word
gpt3_engine = 'davinci'
temperature = 0.9
max_tokens = 10
top_p = 1
frequency_penalty=1
presence_penalty=1
number_of_trials = 100  # this is the number of records to try from each corpus

settings_dict = {
    'word_position_to_predict': word_position_to_predict,
    'gpt3_engine': gpt3_engine,
    'temperature': temperature,
    'max_tokens': max_tokens,
    'top_p': top_p,
    'frequency_penalty': frequency_penalty,
    'presence_penalty': presence_penalty,
    'number_of_trials': number_of_trials
    }

In [6]:
def strip_formatting(word: str) -> str:
    word = word.strip()
    word = word.replace('.', '')
    word = word.replace(',', '')
    word = word.replace('-', '')
    word = word.replace('_', '')
    word = word.replace('!', '')
    word = word.replace('?', '')
    return word

In [7]:
# call GPT3

def gpt3_predict_next_word(gpt3_input: Input) -> GPT_response:

    response = openai.Completion.create(
      engine=gpt3_engine,
      prompt=gpt3_input.prompt,
      temperature=temperature,
      max_tokens=max_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      stop=[gpt3_input.start, '\n'],
    )
    
#     print(response)
    
    predicted_next_word = response['choices'][0]['text'].strip().split(' ')[0]
    
    is_match = strip_formatting(gpt3_input.actual_next_word) == strip_formatting(predicted_next_word)
    
    return GPT_response(response, predicted_next_word, is_match)

In [8]:
# iterate over the specified number of records from each language corpus and ask GPT3 to predict the next word

english_responses = []
spanish_responses = []
code_switched_spanish_english_responses = []

language_tweets_to_responses_iteration_list = [
    (english_tweets, english_responses, 'english'),
    (spanish_tweets, spanish_responses, 'spanish'),
    (codes_switched_spanish_english_tweets, code_switched_spanish_english_responses, 'code_switched_english_spanish')
]

for language_tweets_to_responses in language_tweets_to_responses_iteration_list:
    for tweet in language_tweets_to_responses[0][:number_of_trials]:
        tweet_words = tweet.split(' ')
        if len(tweet_words) < abs(word_position_to_predict) + 4:
            print('\n', '-- WARNING --\n','The following input is being skipped because it is too short: \n', tweet, '\n')
            continue
            
        gpt3_input = Input(original_text=tweet,
                          prompt=' '.join(tweet_words[:word_position_to_predict-1]),
                          start=tweet_words[0],
                          stop=tweet_words[word_position_to_predict-1],
                          actual_next_word=tweet_words[word_position_to_predict]
                         )
        gpt3_response = gpt3_predict_next_word(gpt3_input)
        language_tweets_to_responses[1].append((gpt3_input, gpt3_response))
        time.sleep(2)  # wait for 2 seconds between calls to stay below rate limit of 60 calls per minute

    pprint(language_tweets_to_responses[1][:2])

print('done calling GPT3')



 The following input is being skipped because it is too short: 
 I wana see the vid Kyan 


 The following input is being skipped because it is too short: 
 @Mrhilton1985 Welcome to Twitter xx 


 The following input is being skipped because it is too short: 
 #4WordsOnObamasHand Don't Say The N-Word 


 The following input is being skipped because it is too short: 
 Phil Collins- You Can't Hurry Love 


 The following input is being skipped because it is too short: 
 Equipped a Gamma Mittens. #epicpetwars http://www.epicpetwars.com 


 The following input is being skipped because it is too short: 
 The Newest Member of _________________! #yehbuddy 


 The following input is being skipped because it is too short: 
 oh my goodness I'm emo 


 The following input is being skipped because it is too short: 
 RT @katyperry: I ? New York! 

[(Input(original_text='?RT @justinbiebcr: The bigger the better....if you know what I mean ;)', prompt='?RT @justinbiebcr: The bigger the better....if y

[(Input(original_text='Waiting for my momma to get home y nada :(', prompt='Waiting for my momma to get', start='Waiting', stop='home', actual_next_word='y'),
  GPT_response(gpt3_response=<OpenAIObject text_completion id=cmpl-6HLNB26r1TZ20CxEIo2nRp9QESZ9z at 0x7fc2d902e450> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " here,\u201d she said. \u201cShe"
    }
  ],
  "created": 1669591753,
  "id": "cmpl-6HLNB26r1TZ20CxEIo2nRp9QESZ9z",
  "model": "davinci",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 8,
    "total_tokens": 18
  }
}, predicted_next_word='here,”', predicted_actual_match=False)),
 (Input(original_text='Me dos!!! Y fuck a los que no lea guste lol', prompt='Me dos!!! Y fuck a los que', start='Me', stop='no', actual_next_word='lea'),
  GPT_response(gpt3_response=<OpenAIObject text_completion id=cmpl-6HLNDJBG4cW10WghFPENiqeggf9HC at 0x7fc2d902e130> JSON: {
 

In [9]:
# calculate cumulative accuracy

Performance = namedtuple('Performance', ['language', 'total_trials', 'num_correct', 'accuracy'])

language_tweets_to_response_to_accuracy = []

for language_tweets_to_responses in language_tweets_to_responses_iteration_list:
    number_of_responses, number_of_correct_responses = 0,0

    for _input, response in language_tweets_to_responses[1]:
        number_of_responses += 1
        if response.predicted_actual_match:
            number_of_correct_responses += 1
    
    accuracy = number_of_correct_responses / number_of_responses
    
    language_tweets_to_response_to_accuracy.append(
        (language_tweets_to_responses[0], 
         language_tweets_to_responses[1], 
         Performance(language_tweets_to_responses[2], number_of_responses, number_of_correct_responses, accuracy)))

for results in language_tweets_to_response_to_accuracy:
    print(results[2])
    
    
    

Performance(language='english', total_trials=92, num_correct=1, accuracy=0.010869565217391304)
Performance(language='spanish', total_trials=77, num_correct=1, accuracy=0.012987012987012988)
Performance(language='code_switched_english_spanish', total_trials=87, num_correct=0, accuracy=0.0)


In [10]:
# save results to disk

import json


now = str(int(time.time()))
print('now:', now)


def create_folder_(path):
    if not os.path.exists(path):
        os.makedirs(path)
        

results_path = '../results/' + now
create_folder_(results_path)

with open(results_path + '/standard_perfromance.json', 'w', encoding='utf-8') as f:
    json.dump(language_tweets_to_response_to_accuracy, f, ensure_ascii=False, indent=4)
    

with open(results_path + '/settings.json', 'w', encoding='utf-8') as f:
    json.dump(settings_dict, f, ensure_ascii=False, indent=4)
    
print('settings and standard performance results saved')

now: 1669591982
settings and standard performance results saved


In [13]:
# set the folder to load, the 'now' from the standard results that we would like to use
standard_results_folder = ''  # such as: 1669591982

In [14]:
standard_results = dict()

if standard_results_folder:
    standard_results_path = '../results/' + standard_results_folder + '/standard_perfromance.json'
    with open(standard_results_path) as json_file:
        standard_results = json.load(json_file)

# pprint(standard_results)

if standard_results:
    print('Standard results loaded from run', standard_results_folder)
else:
     print('Standard results not loaded')

Standard results not loaded
