In [None]:
from transformers import pipeline, GPT2Tokenizer,GPT2LMHeadModel
import torch
import numpy as np
import pandas as pd
import random
import pandas
import zipfile
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from io import BytesIO
import requests

# Download City Data
# Fetches the simplemaps world-cities ZIP archive and loads `worldcities.csv`
# from it into `df`. Any failure raises immediately: the rest of the notebook
# needs `df`, so continuing after a failed download would only surface later
# as an unrelated NameError.
url = 'https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.77.zip'
csv_file_name = 'worldcities.csv'
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url, timeout=60)
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download the ZIP file (HTTP {response.status_code}) from {url}"
    )
with zipfile.ZipFile(BytesIO(response.content)) as z:
    if csv_file_name not in z.namelist():
        raise FileNotFoundError(
            f"{csv_file_name} not found in the ZIP archive; archive contains {z.namelist()}"
        )
    with z.open(csv_file_name) as csv_file:
        df = pd.read_csv(csv_file)

# Filter And Sample World Cities Data
# Keep cities with more than 100,000 inhabitants, then drop every city whose
# name occurs more than once (keep=False removes all copies of an ambiguous
# name, not just the extras). Chaining returns a fresh frame instead of
# mutating a boolean-indexed slice in place.
df = (
    df[df.population > 10**5]
    .drop_duplicates(subset=["city"], keep=False)
)
# Fixed seed so the same 1000 cities are drawn on every Restart & Run All.
cities = set(df.city.sample(n=1000, random_state=42))

# Calculates Token Length Adjusted Probability
def sequence_probability(text):
    """Return the geometric-mean per-token probability of ``text`` under GPT-2.

    Each token after the first is scored by the probability the model assigns
    to it given the tokens before it. The per-token log-probabilities are
    averaged and exponentiated, so the result is adjusted for token count.

    Parameters
    ----------
    text : str
        The text to score. It must tokenize to at least two tokens, because
        the first token has no preceding context to be predicted from.

    Returns
    -------
    float
        exp(mean(log p(token_i | token_<i))) over tokens 1..n-1.

    Raises
    ------
    ValueError
        If ``text`` tokenizes to fewer than two tokens.
    """
    # Loading GPT-2 is expensive, so do it once and keep the pair on the
    # function object instead of reloading on every call.
    resources = getattr(sequence_probability, "_resources", None)
    if resources is None:
        resources = (
            GPT2Tokenizer.from_pretrained('gpt2'),
            GPT2LMHeadModel.from_pretrained('gpt2'),
        )
        sequence_probability._resources = resources
    tokenizer, model = resources

    input_ids = tokenizer.encode(text, return_tensors='pt')
    if input_ids.shape[-1] < 2:
        raise ValueError(
            "text must tokenize to at least two tokens to compute conditional probabilities"
        )

    with torch.no_grad():
        logits = model(input_ids).logits

    # A causal LM's logits at position i describe the distribution over token
    # i+1, so pair logits[:, :-1] with input_ids[:, 1:]. The last position
    # predicts a token that is not part of the text and is dropped.
    # log_softmax avoids the underflow to -inf that log(softmax(...)) can hit.
    log_probs = torch.log_softmax(logits[:, :-1, :], dim=-1)
    targets = input_ids[:, 1:].unsqueeze(-1)
    token_log_probs = torch.gather(log_probs, 2, targets).squeeze(-1)
    return float(torch.exp(torch.mean(token_log_probs)))

# Prepare And Evaluate Prompts
# For every sampled city, fill the template with each candidate population and
# score the resulting sentence with sequence_probability.
text = "{city} is a city that I've visited. It has a population of {population} people."
temps = []  # not used in this cell; left defined in case another cell reads it
temp_cities = {}
pops = [100_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 25_000_000]
for city in cities:
    # One score per candidate population, in the same order as `pops`.
    temp_cities[city] = [
        sequence_probability(text.format(population=str(pop), city=city))
        for pop in pops
    ]
# Columns are cities; the rows follow the order of `pops`.
frame = pd.DataFrame(temp_cities)

# frame.to_csv("Stored_Research_Prompt_Frame.csv")

In [None]:
# Normalize And Reduce Dimensionality Of Probability Outputs
# Scale each city's column of scores to sum to 1, then subtract from every
# row its mean across all cities.
column_totals = frame.sum(axis=0)
frame_norm = frame.div(column_totals, axis="columns")
row_means = frame_norm.mean(axis=1)
X = frame_norm.sub(row_means, axis=0)

# PCA To Isolate 1st Princomp
# Fit on the transpose so each city is one sample and each row of `frame`
# (one candidate population) is one feature.
pca = PCA(n_components=3).fit(X.T)
first_princomp = pca.components_[0]

# Takes Inner Product Of First Princomp And Data
# A Monotonically Increasing Array Also Works Fairly Well
# first_princomp = np.array([0,1,2,3,4,5,6])

# Project every city's centered score vector onto the first component.
inner_products = {
    column: np.dot(first_princomp, X[column]) for column in X.columns
}

In [None]:
# Run And Evaluate Regression

# Prep Data
# This cell ends with `city` as the index of `df`. To stay re-runnable without
# re-executing the download cell, start from a frame that always has `city`
# as a column again.
city_source = df if "city" in df.columns else df.reset_index()
df = (
    city_source
    # Attach each sampled city's projection; cities that were not sampled map to NaN.
    .assign(Guess=city_source["city"].map(inner_products))
    .dropna(subset=["Guess"])
    .set_index("city")
    .sort_index()
)
# Target is log population; the single feature is the projection score.
y = np.log(df["population"])
X = df[["Guess"]]

# Run And Cross Validate
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=4, scoring='r2')
r2 = np.mean(scores)
print("Mean R2:", r2)
print("Individual Scores:")
print(scores)