# Using Text Mining and Natural Language Processing to Evaluate Powerlifting Injuries: A Rapid Analysis of Current Studies

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Search phrases for the scraper: each injury/pain term below is paired with
# the word "powerlifting" to form one query string.
keywords = [
    complaint + " powerlifting"
    for complaint in (
        "lower back pain",
        "shoulder injury",
        "knee injury",
        "wrist pain",
        "elbow pain",
        "hip injury",
        "ankle injury",
    )
]

def fetch_articles(keyword, num_articles=30):
    """Scrape Google Scholar search results for one query.

    Result pages are requested with the ``start`` offset advancing in steps
    of 10 (assumes Scholar serves ten hits per page). For every
    ``div.gs_ri`` result block that has both a title (``h3.gs_rt``) and a
    snippet (``div.gs_rs``), the title text, snippet text and title link
    are collected.

    Args:
        keyword: Search phrase sent as the ``q`` query parameter.
        num_articles: Maximum number of results to return. Values <= 0
            yield an empty list without issuing any request.

    Returns:
        A list of at most ``num_articles`` dicts with the keys ``'title'``,
        ``'snippet'`` and ``'link'``. ``'link'`` is the string
        ``'No link available'`` when the title has no anchor or the anchor
        has no ``href``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on a
            connection failure or timeout.
    """
    import warnings

    base_url = 'https://scholar.google.com/scholar'
    no_link = 'No link available'
    articles = []

    for start in range(0, num_articles, 10):
        params = {
            'hl': 'en',
            'as_sdt': '0,5',
            'q': keyword,
            'start': start
        }
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole collection run indefinitely.
        response = requests.get(base_url, params=params, timeout=30)

        # An error response (e.g. rate limiting) is not a results page, so
        # parsing it would silently yield nothing. Report it and stop paging.
        if not response.ok:
            warnings.warn(
                "Google Scholar returned HTTP %s for query %r at start=%s; "
                "stopping early" % (response.status_code, keyword, start)
            )
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all the search result divs
        result_divs = soup.find_all('div', class_='gs_ri')
        if not result_divs:
            # Nothing on this page: either the results ran out or the body
            # was not a normal results page. Later offsets are skipped.
            break

        for div in result_divs:
            title_tag = div.find('h3', class_='gs_rt')
            snippet_tag = div.find('div', class_='gs_rs')
            if not (title_tag and snippet_tag):
                continue

            link_tag = title_tag.find('a')
            # .get() avoids a KeyError on an anchor that lacks an href.
            link = link_tag.get('href', no_link) if link_tag else no_link
            articles.append({
                'title': title_tag.text,
                'snippet': snippet_tag.text,
                'link': link,
            })

    # Pages arrive in blocks of ten, so trim to honour num_articles when it
    # is not a multiple of ten.
    return articles[:num_articles]

def clean_text(text):
    """Normalise a snippet to lowercase words separated by single spaces.

    Steps, in order:
      1. Collapse whitespace runs (including newlines) to one space, so a
         bracketed citation split across lines is still matched in step 2.
      2. Remove bracketed citations such as ``[12]``.
      3. Replace every non-word character with a space.
      4. Collapse the space runs that steps 2 and 3 create.
      5. Lowercase and strip leading/trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string; empty when ``text`` contains no word characters.
    """
    text = re.sub(r'\s+', ' ', text)  # Flatten newlines/tabs before citation matching
    text = re.sub(r'\[.*?\]', '', text)  # Remove citations
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    # Steps 2 and 3 leave runs of spaces behind ("a, b" -> "a  b"), so
    # collapse again; only plain spaces and word characters remain here.
    text = re.sub(r' +', ' ', text)
    return text.lower().strip()

# Fetch and preprocess articles: one scrape per search phrase, pooled together
all_articles = []
for keyword in keywords:
    articles = fetch_articles(keyword)
    all_articles.extend(articles)

# Filter out any articles that don't have the required keys
required_keys = ('title', 'snippet', 'link')
filtered_articles = [
    article for article in all_articles
    if all(key in article for key in required_keys)
]

# Create DataFrame. The columns are named explicitly so the frame still has a
# 'snippet' column when nothing was scraped; otherwise the lookup below
# raises KeyError on an empty, column-less frame.
df = pd.DataFrame(filtered_articles, columns=list(required_keys))
df['snippet'] = df['snippet'].apply(clean_text)

# Save raw data for further inspection and annotation
df.to_csv('powerlifting_injuries.csv', index=False)

if df.empty:
    # Make an empty harvest visible instead of reporting plain success.
    print("Warning: no articles were retrieved; the CSV contains only the header row.")
print("Data collection complete.")


Data collection complete.
