In [91]:
# import libraries 
import pandas as pd 
import numpy as np
import re
import math
import requests
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vietv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [92]:
# Start session
session = requests.Session()

In [93]:
# Get login page to extract CSRF token; Ravelry requires user to login to view all page content
login_url = 'https://www.ravelry.com/account/login'
headers = {
    "User-Agent": "Mozilla/5.0",
}
login_page = session.get(login_url, headers=headers)
soup = BeautifulSoup(login_page.text, 'html.parser')

In [94]:
# Get CSRF token
csrf_token = soup.find('input', {'name': 'authenticity_token'})
if not csrf_token:
    raise Exception("CSRF token not found!")
csrf_value = csrf_token['value']

In [95]:
# Open file to retrieve login info, then prepare login payload with token
with open("login-info.txt") as f:
    username = f.readline().strip()
    password = f.readline().strip()
    #print(username, password)
    
payload = {
    'user[login]': username,
    'user[password]': password,
    'authenticity_token': csrf_value,
}


In [96]:
# Post to login
login_response = session.post(login_url, data=payload, headers=headers)

In [97]:
# Access projects page for pattern
url = "https://www.ravelry.com/patterns/library/agnete-cardigan/people?page=1&view=cards"
response1 = session.get(url, headers=headers)


In [98]:
# Parse the page
soup = BeautifulSoup(response1.text, "html.parser")
# Print page
print(soup.prettify())

<!DOCTYPE html>
<html class="" data-large-font="0" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   Ravelry: Project Gallery for Agnete Cardigan pattern by PetiteKnit
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="2CzaygpPGITtnFk3yH69cNZnLAdgQ9kTmOGaznGUmj4=" id="authenticity-token" name="authenticity-token"/>
  <meta content="origin" name="referrer"/>
  <meta content="noodp" name="robots"/>
  <meta content="Ravelry" name="application-name"/>
  <meta content="Ravelry" name="apple-mobile-web-app-title"/>
  <meta content="#191c27" name="theme-color"/>
  <link href="/manifest.webmanifest?v=20231002" rel="manifest"/>
  <link href="https://style-cdn.ravelrycache.com/stylesheets/ravelry_legacy_v1_2_2503131122.css" rel="Stylesheet" type="text/css">
   <link href="https://style-cdn.ravelrycache.com/stylesheets/ravelry_components_v1_2_2408121605.css" rel="Stylesheet" type="text/css"/>
   <script src="https://s

In [99]:
# Get number of projects
# num_projects is sometimes larger than number of projects shown on projects page (likely due to some users having project(s) but no post)
people_tab =soup.find("span", id="people_tab")
link_text = people_tab.get_text(strip=True)
match = re.search(r'\((\d+)\)', link_text)
num_projects = int(match.group(1))
# print(num_projects)

In [100]:
# Estimate number of pages based on number of projects, each page can hold a max of 32 project cards
num_pages = int(math.ceil(num_projects/32.0))
# print(num_pages)


In [101]:
# List to hold rows of each users' projects
rows = []
# Iterate through all pages
for i in range(1, num_pages + 1):
    url = f"https://www.ravelry.com/patterns/library/agnete-cardigan/people?page={i}&view=cards"
    response2 = session.get(url, headers=headers)
    soup = BeautifulSoup(response2.text, "html.parser")

    # Iterate through all every project card on each page
    for card in soup.find_all("div", class_="notebook_card__main"):
        # Get projects page url
        notebook_card__title  = card.find("div", class_="notebook_card__title")
        a_tag = notebook_card__title.find("a")
        url_temp = a_tag['href']

        # Get username
        username = a_tag.get_text(strip=True) if a_tag else ""
        username = username.split("'s")[0]

        # Get yarn info (name and colorway)
        yarn_name = card.find("div", class_="notebook_card__yarn_name")
        yarn_colorway = card.find("div", class_="notebook_card__yarn_colorway")

        # Get completion status and date
        notebook_card__status = card.find("div", class_="notebook_card__status")
        status = notebook_card__status.find("span")
        status_text = status.get_text(strip=True) if status else ""
        full_text = notebook_card__status.get_text(separator=" ", strip=True)
        date = full_text.replace(status_text, "").strip() if status_text else ""
        date = date.replace("\n", " ").strip()

        # Use project url to get full project notes
        response3 = session.get(url_temp, headers=headers)
        soup_notes = BeautifulSoup(response3.text, "html.parser")
        notes = soup_notes.find("div", class_="notes markdown core_item_content__text_block")
        paragraphs = notes.find_all("p")

        # Extract text from each paragraph
        note_texts = [p.get_text(strip=True) for p in paragraphs]
        full_project_notes = " ".join(note_texts)

        # Build a dictionary for each project
        row = {
            "username": username,
            "yarn_name": yarn_name.get_text(strip=True) if yarn_name else "",
            "yarn_colorway": yarn_colorway.get_text(strip=True) if yarn_colorway else "",
            "project_notes": full_project_notes,
            "status": status_text,
            "date": date,
            "url": url_temp
        }
        rows.append(row)

In [102]:
# Create dataframe and display head
df = pd.DataFrame(rows)
print(df.head())

        username                       yarn_name       yarn_colorway  \
0         amebou  Hillesvåg Ullvarefabrikk Sølje  2140 St Vet Ismint   
1  jooordangreen            Isager Yarn Alpaca 2            30 Black   
2     TTamaraToo             Lana Grossa Ecopuno                  56   
3      kristiiin                Alpakkagarn 100%                       
4  PixKnitAgathe                yarnbysimone KMS     Seulement noir​   

                                       project_notes    status      date  \
0                                                     Finished  May 2025   
1  First Cardigan - let’s see how this goes. Star...  Finished  May 2025   
2  When putting the buttonband stiches on hold, I...  Finished  May 2025   
3  I tvil siden strikkefastheten avviker ganske m...  Finished  May 2025   
4  Projet lancé lors deu kniteat 2024 avec les co...  Finished  May 2025   

                                                 url  
0  https://www.ravelry.com/projects/amebou/agnete...  


In [103]:
# Any missing data?
for i in df.columns:
    percent_missing = np.mean(df[i].isnull())
    print('{} - {}%'.format(i, percent_missing))

# Fill missing project notes with empty string
df['project_notes'] = df['project_notes'].fillna("")

username - 0.0%
yarn_name - 0.0%
yarn_colorway - 0.0%
project_notes - 0.0%
status - 0.0%
date - 0.0%
url - 0.0%


In [104]:
# Export as excel file; this can be loaded into a database later
df.to_excel("Agnete-Cardigan.xlsx")

In [105]:
# Read excel file to create df again; Faster than webscraping again in future
file_path = r'Agnete-Cardigan.xlsx' 
new_df = pd.read_excel(file_path)
print(new_df.head())

   Unnamed: 0       username                       yarn_name  \
0           0         amebou  Hillesvåg Ullvarefabrikk Sølje   
1           1  jooordangreen            Isager Yarn Alpaca 2   
2           2     TTamaraToo             Lana Grossa Ecopuno   
3           3      kristiiin                Alpakkagarn 100%   
4           4  PixKnitAgathe                yarnbysimone KMS   

        yarn_colorway                                      project_notes  \
0  2140 St Vet Ismint                                                NaN   
1            30 Black  First Cardigan - let’s see how this goes. Star...   
2                  56  When putting the buttonband stiches on hold, I...   
3                 NaN  I tvil siden strikkefastheten avviker ganske m...   
4     Seulement noir​  Projet lancé lors deu kniteat 2024 avec les co...   

     status      date                                                url  
0  Finished  May 2025  https://www.ravelry.com/projects/amebou/agnete...  
1  Finis

In [106]:
# NLP-based sentiment analysis using Robert De La Cruz's guide: 
# https://medium.com/@robdelacruz/sentiment-analysis-using-natural-language-processing-nlp-3c12b77a73ec
# Read in Amazon reviews dataset from excel file
Amazon_df = pd.read_csv('train.csv',names=['sentiment', 'title', 'review'])
print(Amazon_df.head())

# Any missing data?
for i in Amazon_df.columns:
    percent_missing = np.mean(Amazon_df[i].isnull())
    print('{} - {}%'.format(i, percent_missing))

# Replace missing title data with an empty string
Amazon_df['title'] = Amazon_df['title'].fillna("")

   sentiment                                  title  \
0          3                     more like funchuck   
1          5                              Inspiring   
2          5  The best soundtrack ever to anything.   
3          4                       Chrono Cross OST   
4          5                    Too good to be true   

                                              review  
0  Gave this to my dad for a gag gift after direc...  
1  I hope a lot of people hear this cd. We need m...  
2  I'm reading a lot of reviews saying that this ...  
3  The music of Yasunori Misuda is without questi...  
4  Probably the greatest soundtrack in history! U...  
sentiment - 0.0%
title - 6.266666666666667e-05%
review - 0.0%


In [107]:
# Define X and Y, replace star ratings with Negative, Neutral, and Positive sentiments
X = Amazon_df.title + ". " + Amazon_df.review
y = Amazon_df.sentiment.replace({
    1: 'Negative',
    2: 'Negative',
    3: 'Neutral',
    4: 'Positive',
    5: 'Positive'
})

In [108]:
# Train test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

# Function to replace numbers from the string
preprocessor = lambda text: re.sub(r'[^a-z ]', '', text.lower())

# Construct the pipeline with procedural steps to process data and cast predictions
pipe = Pipeline([
  ('vec', CountVectorizer(stop_words='english', min_df=1000, preprocessor=preprocessor)),
  ('tfid', TfidfTransformer()),
  ('lr', SGDClassifier(loss='log_loss'))
])

# Fit the model to data
model = pipe.fit(X_train, y_train)

In [109]:
# Predict sentiment on the test data
y_test_pred = model.predict(X_test)

# Create the classification report
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

    Negative       0.70      0.84      0.76    239551
     Neutral       0.58      0.10      0.17    120067
    Positive       0.70      0.85      0.77    240382

    accuracy                           0.70    600000
   macro avg       0.66      0.60      0.57    600000
weighted avg       0.68      0.70      0.65    600000



In [111]:
# Use model to make predictions on sentiment
new_df['prediction'] = model.predict(new_df['project_notes'])

new_df.to_excel("predictions.xlsx")