In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!git clone https://github.com/huggingface/transformers
!python transformers/utils/download_glue_data.py

# Now do it with your data

## load data and clean

In [8]:
import pandas as pd
import numpy as np
from langdetect import detect

In [9]:
df_whole = pd.read_csv("data/all_routes_and_desc.csv")
df_whole = df_whole.rename(columns=lambda x: x.strip()) # Removes whitespace around column names
df_whole["words"] = df_whole["desc"] + " " + df_whole["protection"]
df = df_whole[["words", "num_votes", "Avg Stars"]]
df.head()

Unnamed: 0,words,num_votes,Avg Stars
0,This is a really great route~ with awesome exp...,22,2.9
1,from tabvar: Cool fins to roof~ thin holds...,1,2.0
2,A safe mixed route with a bit of run out up to...,3,2.7
3,Start on a slab under a left leaning arched ro...,1,2.0
4,Fun technical climbing. Tricky right off the b...,3,3.0


In [10]:
# Remove rows with no description
bad_df = df[df.words.apply(lambda x: len(str(x))<=5)]
new_df = df[~df.words.isin(bad_df.words)]
print(len(df), len(bad_df), len(new_df), len(df)-len(bad_df)==len(new_df))
df = new_df

116700 352 116348 True


In [11]:
# Remove non-english entries
# takes a few minutes...
def is_english(x):
    try:
        return detect(x)
    except:
        return None

df["english"] = df['words'].apply(lambda x: is_english(x) == 'en')

In [None]:
df = df[df.english]
df = df[["words", "num_votes", "Avg Stars"]]

In [21]:
np.where(df.num_votes <= 9)[0]
df.head()

Unnamed: 0,words,num_votes,Avg Stars
0,This is a really great route~ with awesome exp...,22,2.9
1,from tabvar: Cool fins to roof~ thin holds...,1,2.0
2,A safe mixed route with a bit of run out up to...,3,2.7
3,Start on a slab under a left leaning arched ro...,1,2.0
4,Fun technical climbing. Tricky right off the b...,3,3.0


In [22]:
# Now remove rows with less than 10 votes
few_votes = np.where(df.num_votes <= 9)[0]
for vote in few_votes:
    try:
        df.drop(vote, inplace = True)
    except:
        pass
# df_small = df.drop(few_votes)
# df = df_small
len(df)

31022

In [None]:
df.to_csv('data/words_and_stars_no_ninevotes.csv', index=False, header=True)

## Now, tune DistilBERT with the route data

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig, TFAutoModelWithLMHead, TFAutoModel, AutoModel

import tensorflow as tf
import numpy as np

classifier = pipeline('sentiment-analysis')

In [None]:
df = pd.read_csv('data/words_and_stars_no_ninevotes.csv')
df.replace(4,3.9999999) # prevents errors
df.head()

#### This code is a tester for 3000 examples, just to make sure it runs. 

In [None]:
df_1k = df[:3000]

# normalize star values
df_1k["norm_star"] = df_1k["Avg Stars"]/4
df_1k.replace(1., .9999)

df_1k.norm_star.unique()

model_name = "distilbert-base-uncased"
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

tf_batch = tokenizer(
     list(df_1k["words"]),
     padding=True,
     truncation=True,
     return_tensors="tf"
 )

tf_outputs = tf_model(tf_batch, labels = tf.constant(list(df_1k["norm_star"]), dtype=tf.float64))

In [None]:
loss = [list(df_1k["norm_star"])[i]-float(tf_outputs[0][i]) for i in range(len(df_1k))]
star_diff = (sum(loss)/3000)
star_diff # off on average by .33 stars

In [None]:
# Save the model
save_directory = "models/route_model"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

#### This code uses all the examples and saves the model

Too expensive: I had to do it in batches

In [None]:
# # normalize star values
# df["norm_star"] = df["Avg Stars"]/2
# df.head()

# # drop null entries
# print(len(np.where(pd.isnull(df["words"]))[0])) # 288 null entries
# df.dropna(inplace = True)

# model_name = "distilbert-base-uncased"
# tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# tf_batch = tokenizer(
#      list(df["words"]),
#      padding=True,
#      truncation=True,
#      return_tensors="tf"
#  )

# tf_outputs = tf_model(tf_batch, labels = tf.constant(list(df["norm_star"]), dtype=tf.float64))

# loss = [list(df["norm_star"])[i]-float(tf_outputs[0][i]) for i in range(len(df))]
# star_diff = (sum(loss)/1000)*4
# star_diff

# # Save the model
# save_directory = "models/route_model"
# tokenizer.save_pretrained(save_directory)
# model.save_pretrained(save_directory)

#### Now, finish tuning by doing 3000 at a time until you have done it all. 

In [None]:
import time
len(df)

In [None]:
chunks = [(3000,6000), (6000, 9000), (9000, 12000), (12000, 15000), (15000, 18000), (18000, 21000), 
          (21000, 24000), (24000, 27000), (27000, len(df))]
losses = [star_diff]

save_directory = "models/route_model"

model_num = 2
tmin = time.time()

for chunk in chunks:
    t0 = time.time()
    
    df_1k = df[chunk[0]:chunk[1]]

    # normalize star values
    df_1k["norm_star"] = df_1k["Avg Stars"]/4
    df_1k.replace(1., .9999)

    # Reload pretrained model
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)

    # Initiate tokenizer
    tf_batch = tokenizer(list(df_1k["words"]), padding=True, truncation=True, return_tensors="tf")
    
    # Get outputs
    tf_outputs = tf_model(tf_batch, labels = tf.constant(list(df_1k["norm_star"]), dtype=tf.float64))
    
    # Calculate loss
    loss = [list(df_1k["norm_star"])[i]-float(tf_outputs[0][i]) for i in range(len(df_1k))]
    star_diff = (sum(loss)/(chunk[1]-chunk[0]))
    losses.append(loss)
    
    # Save the big model
    tokenizer.save_pretrained(save_directory)
    model.save_pretrained(save_directory)
    
    # Save the intermediate model
    inter_save_directory = "models/inter_models/model_" + str(model_num)
    tokenizer.save_pretrained(inter_save_directory)
    model.save_pretrained(inter_save_directory)
    model_num += 1
    
    
    t1 = time.time()
    elap = t1-t0
    
    print("%.2f minutes, or %.2f hours have passed for iteration between %s and %s." %(elap/60, elap/60/60, chunk[0], chunk[1]))
    
tmax = time.time()
print("This code took a total of %.2f minutes, or %.2f hours to run." %((tmax-tmin)/60, (tmax-tmin)/60/60))

In [None]:
chunks = [(6000, 9000), (9000, 12000), (12000, 15000), (15000, 18000), (18000, 21000), 
          (21000, 24000), (24000, 27000), (27000, len(df))]
losses = [star_diff]

model_num = 3
tmin = time.time()

for chunk in chunks:
    t0 = time.time()
    
    df_1k = df[chunk[0]:chunk[1]]

    # normalize star values
    df_1k["norm_star"] = df_1k["Avg Stars"]/4
    df_1k.replace(1., .9999)

    # Reload pretrained model
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = TFAutoModel.from_pretrained(save_directory, from_pt=False)

    # Initiate tokenizer
    tf_batch = tokenizer(list(df_1k["words"]), padding=True, truncation=True, return_tensors="tf")
    
    # Get outputs
    tf_outputs = tf_model(tf_batch, labels = tf.constant(list(df_1k["norm_star"]), dtype=tf.float64))
    
    # Calculate loss
    loss = [list(df_1k["norm_star"])[i]-float(tf_outputs[0][i]) for i in range(len(df_1k))]
    star_diff = (sum(loss)/(chunk[1]-chunk[0]))
    losses.append(loss)
    
    # Save the big model
    save_directory = "models/route_model"
    tokenizer.save_pretrained(save_directory)
    model.save_pretrained(save_directory)
    
    # Save the intermediate model
    inter_save_directory = "models/inter_models/model_" + str(model_num)
    tokenizer.save_pretrained(inter_save_directory)
    model.save_pretrained(inter_save_directory)
    model_num += 1
    
    t1 = time.time()
    elap = t1-t0
    
    print("%.2f minutes, or %.2f hours have passed for iteration between %s and %s." %(elap/60, elap/60/60, chunk[0], chunk[1]))
    
tmax = time.time()
print("This code took a total of %.2f minutes, or %.2f hours to run." %((tmax-tmin)/60, (tmax-tmin)/60/60))

## Now, use this model as the base for the 1,099 gear reviews

In [None]:
gear_df = pd.read_csv("data/trailspace_gear_reviews.csv")
gear_df = gear_df.rename(columns=lambda x: x.strip())
gear_df.head()

In [None]:
# normalize rating
gear_df["norm_rating"] = gear_df["rating"]/5*2
gear_df["norm_rating"] = gear_df["norm_rating"].replace(2., 1.99999)
gear_df.head()

In [None]:
# Reload pretrained model
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = TFAutoModel.from_pretrained(save_directory, from_pt=False)

In [None]:
# Add some more training
tf_batch = tokenizer(
     list(gear_df["rating_text"]),
     padding=True,
     truncation=True,
     return_tensors="tf"
 )

In [None]:
tf_outputs = tf_model(tf_batch, labels = tf.constant(list(gear_df["norm_rating"]), dtype=tf.float64))

In [None]:
loss = [list(gear_df["norm_rating"])[i]-float(tf_outputs[0][i]) for i in range(len(gear_df))]
star_diff = (sum(loss)/len(gear_df))*5
star_diff

In [None]:
# Save the model
save_directory = "models/trailspace_and_route_model"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

## Now, tune a DistillBERT model only on the gear reviews

In [151]:
gear_df = pd.read_csv("data/trailspace_gear_reviews.csv")
gear_df = gear_df.rename(columns=lambda x: x.strip())

# normalize rating
gear_df["norm_rating"] = gear_df["rating"]/5*2
gear_df["norm_rating"] = gear_df["norm_rating"].replace(2., 1.99999)
gear_df.head()

model_name = "distilbert-base-uncased"
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

tf_batch = tokenizer(
     list(gear_df["rating_text"]),
     padding=True,
     truncation=True,
     return_tensors="tf"
 )

tf_outputs = tf_model(tf_batch, labels = tf.constant(list(gear_df["norm_rating"]), dtype=tf.float64))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_503']
You should probably TRAIN this model on a down-stream task to be able to use 

In [152]:
# Save the model
save_directory = "models/trailspace_model"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)