# From raw twitter scrape json to clean fine tune babyfood jsonl

## snatch tweets from json
Reads a JSON file containing a list of dictionaries; the target (the tweet) is the value stored under the key "text" in each dictionary.

In [17]:
import json
 
# Load the raw scrape: a JSON list of dictionaries, one per tweet.
# The context manager closes the file handle even if parsing fails, and the
# explicit encoding keeps the read independent of the platform default.
with open('twitter_dump2.json', encoding="utf-8") as f:
    json_file = json.load(f)

# Keep only the tweet bodies, which live under each record's "text" key.
text_file = [dictionary["text"] for dictionary in json_file]
    

## tweet cleaner
If we decide that hashtags, emojis or @mentions are part of contemporary expression that belongs in poetry, we could comment out the corresponding removal steps.

In [20]:
import re
import emoji


def tweet_cleaner(tweet):
    """Strip Twitter-specific markup from a single tweet.

    Removes, in order: @mentions, #hashtags, a leading stand-alone "RT"
    retweet marker, URLs starting with "http", and emojis. Leading and
    trailing whitespace is trimmed last.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. This is an empty string when the tweet consisted
        only of removed elements.
    """
    # Raw strings keep "\w", "\S" and "\b" as regex escapes rather than
    # (invalid) string escape sequences.
    mentions_free = re.sub(r"@\w+", "", tweet)
    hashtag_free = re.sub(r"#\w+", "", mentions_free)
    # "\b" restricts the match to a stand-alone "RT" so that words which
    # merely start with those letters are left intact.
    rt_free = re.sub(r"^RT\b", "", hashtag_free)
    url_free = re.sub(r"http\S+", "", rt_free)
    # emoji.get_emoji_regexp() no longer exists in emoji >= 2.0; prefer
    # replace_emoji() and fall back to the regexp only on old releases.
    if hasattr(emoji, "replace_emoji"):
        emo_free = emoji.replace_emoji(url_free, replace="")
    else:
        emo_free = emoji.get_emoji_regexp().sub("", url_free)

    return emo_free.strip()

In [44]:
# Run every raw tweet through the cleaner, then display how many there are.
clean_tweets = list(map(tweet_cleaner, text_file))
len(clean_tweets)

73837

## unicode
This removes things like \xa0, \n and so on: non-ASCII characters are dropped and runs of whitespace are collapsed into single spaces.

In [45]:
def unicode_maker(text):
    """Reduce *text* to plain ASCII with single spaces between words.

    Args:
        text: Input string, possibly containing non-ASCII characters,
            newlines, tabs or non-breaking spaces.

    Returns:
        The text with every non-ASCII character dropped, every run of
        whitespace collapsed to one space, and no leading or trailing
        whitespace.
    """
    # Split on whitespace *before* dropping non-ASCII characters.
    # str.split() treats Unicode whitespace such as "\xa0" as a separator,
    # whereas encoding first would delete it and glue the neighbouring
    # words together.
    spaced = " ".join(text.split())
    ascii_text = spaced.encode(encoding="ascii", errors="ignore").decode("ascii")
    # A word made only of non-ASCII characters is now empty, which leaves a
    # double space behind; collapse once more.
    return " ".join(ascii_text.split())

# Convert every cleaned tweet to plain ASCII, then display the result.
uni_tweets = list(map(unicode_maker, clean_tweets))
uni_tweets

[': Deployment confirmed, s DART is on its way to redirect an asteroid',
 ': Watch Falcon 9 launch s DART mission humanitys first planetary defense test to redirect an asteroid',
 'About to launch asteroid defense test!',
 'That is the great question',
 'Exactly',
 '',
 ': All systems and weather are looking good for tonights Falcon 9 launch of s DART into an asteroid-intercepting interplane',
 'Exactly (sigh)',
 'Sounds good',
 'It can simulate anywhere, but we need to focus on one region until FSD is out of beta, then expand geographically to have betas in other countries. Canada is not a lot different from US, but different enough.',
 'High-performance code is in C using our own compiler to maximize frame rate. We use Python with custom libraries for fast prototyping, then convert to C for fast execution. New software is run against real-world video &amp; our world simulator to evaluate performance, then QA.',
 'I dont use Binance (tried at one point, but signup was too many hoops t

## remove punctuation, numbers etc.

In [46]:
import string

In [47]:
def number_punct(sentence):
    """Return *sentence* lower-cased with digits and punctuation removed.

    Args:
        sentence: Text to filter character by character.

    Returns:
        The lower-cased text without any digit characters or any character
        listed in ``string.punctuation``. Whitespace is left untouched.
    """
    kept_chars = []
    for char in sentence:
        # Skip digits and punctuation marks; everything else is kept.
        if char.isdigit() or char in string.punctuation:
            continue
        kept_chars.append(char)

    return "".join(kept_chars).lower()

# Strip digits and punctuation from every tweet, then display the result.
number_free = list(map(number_punct, uni_tweets))
number_free

In [49]:
# Keep only tweets with more than two whitespace-separated words
# (i.e. drop tweets of two words or fewer).
more_than_two = [tweet for tweet in number_free if len(tweet.split()) >= 3]

# Collapse exact duplicates; note that a set does not keep the tweet order.
clean_set = {*more_than_two}
clean_set

## remove empty strings and excess whitespaces
Strings become empty when they were made up only of mentions, URLs, hashtags, etc.

In [51]:
# Discard empty strings (filter(None, ...) keeps only truthy items).
no_empty_strings = list(filter(None, clean_set))
# Trim leading and trailing whitespace from every tweet.
stripped = list(map(str.strip, no_empty_strings))
# Squeeze each run of spaces inside a tweet down to a single space.
one_space = [re.sub(' +', ' ', tweet) for tweet in stripped]
one_space

## Creates a DataFrame with a prompt and completion column

In [197]:
# pandas is needed for the DataFrame below; importing it here is harmless
# if it has already been loaded elsewhere.
import pandas as pd

# The "segment" value of the first scraped record is used as the prompt for
# every row.
# NOTE(review): assumes "segment" holds a single scalar (e.g. a string) so
# pandas can broadcast it against the list of completions. Confirm against
# the scrape format.
keyword = json_file[0]["segment"]
df_prompt_comp = pd.DataFrame({"prompt": keyword, "completion": one_space})
# Whitespace normalisation can make previously distinct tweets identical,
# so drop repeated rows once more.
df_dub_free = df_prompt_comp.drop_duplicates()

## Creates a JSON Lines file that can be used to fine-tune a GPT-3 model

In [197]:
# Write one JSON object per row, one row per line (JSON Lines layout).
df_dub_free.to_json(
    "p_comp4.json",
    orient="records",
    lines=True,
)

# Not Used

Creates a text file without prompts: just the clean tweets, one per line.

In [65]:
# Glue the tweets into one blob; every tweet except the last is followed by
# a period and a newline.
one_long_string = ".\n".join(one_space)

# Dump the blob to disk; the context manager closes the file on exit.
with open("file10.txt", mode="w") as output:
    print(one_long_string, end="", file=output)

## Tokenize
spits out a list of lists of important words 

In [206]:
import string 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words("english"))


def text_to_token(sentence):
    """Tokenize *sentence* and drop English stop words.

    Args:
        sentence: Text to split into word tokens.

    Returns:
        A list of the tokens that are not in ``stop_words``, in their
        original order.
    """
    return [token for token in word_tokenize(sentence) if token not in stop_words]

In [207]:
# Tokenize every cleaned tweet, then display the list of token lists.
tokens = list(map(text_to_token, one_space))
tokens

[['universe',
  'works',
  'computers',
  'use',
  'video',
  'games',
  'learn',
  'function',
  'physical',
  'world'],
 ['seriously',
  'guys',
  'actually',
  'besides',
  'act',
  'outraged',
  'press',
  'know',
  'rarely',
  'see',
  'constituents'],
 ['ive', 'got', 'go', 'back', 'watch', 'parts'],
 ['transformation',
  'rwandas',
  'health',
  'system',
  'roots',
  'simple',
  'powerful',
  'belief',
  'everyone',
  'deserves',
  'shot',
  'healthy',
  'life',
  'matter',
  'income',
  'ethnic',
  'group',
  'call',
  'home'],
 ['sometimes', 'socalled', 'law', 'order', 'timeswellnot', 'much'],
 ['weekwhy', 'dont', 'get', 'town'],
 ['three',
  'days',
  'conference',
  'hosted',
  'collaboration',
  'w',
  'visit',
  'htt'],
 ['dont', 'play', 'allstar', 'game', 'nba', 'wont', 'let'],
 ['whew', 'put', 'foot', 'one'],
 ['watching',
  'oscar',
  'short',
  'subject',
  'docs',
  'films',
  'stands',
  'top',
  'immediately',
  'relevant'],
 ['great',
  'speaking',
  'advisory',
  

## Lemmatize
Gets rid of plurals and so on, without mutilating words the way stemming does.

In [208]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance, reused for every call below.
lemma = WordNetLemmatizer()


def get_lemm(l):
    """Lemmatize each word in *l*.

    Args:
        l: Iterable of word strings (one tokenized tweet).

    Returns:
        A list with the lemmatized form of each word, in the same order.
    """
    return list(map(lemma.lemmatize, l))

In [209]:
# Lemmatize every token list, then display the result.
lemmas = list(map(get_lemm, tokens))
lemmas

[['universe',
  'work',
  'computer',
  'use',
  'video',
  'game',
  'learn',
  'function',
  'physical',
  'world'],
 ['seriously',
  'guy',
  'actually',
  'besides',
  'act',
  'outraged',
  'press',
  'know',
  'rarely',
  'see',
  'constituent'],
 ['ive', 'got', 'go', 'back', 'watch', 'part'],
 ['transformation',
  'rwanda',
  'health',
  'system',
  'root',
  'simple',
  'powerful',
  'belief',
  'everyone',
  'deserves',
  'shot',
  'healthy',
  'life',
  'matter',
  'income',
  'ethnic',
  'group',
  'call',
  'home'],
 ['sometimes', 'socalled', 'law', 'order', 'timeswellnot', 'much'],
 ['weekwhy', 'dont', 'get', 'town'],
 ['three',
  'day',
  'conference',
  'hosted',
  'collaboration',
  'w',
  'visit',
  'htt'],
 ['dont', 'play', 'allstar', 'game', 'nba', 'wont', 'let'],
 ['whew', 'put', 'foot', 'one'],
 ['watching',
  'oscar',
  'short',
  'subject',
  'doc',
  'film',
  'stand',
  'top',
  'immediately',
  'relevant'],
 ['great',
  'speaking',
  'advisory',
  'council',
 