# Using Text Mining and Natural Language Processing to Evaluate Powerlifting Injuries: A Rapid Analysis of Current Studies

In [11]:
import requests
from bs4 import BeautifulSoup

# Function to scrape scientific articles from Google Scholar
def fetch_articles(query, num_articles=30):
    base_url = 'https://scholar.google.com/scholar'
    articles = []

    for i in range(0, num_articles, 10):  # Google Scholar shows 10 articles per page
        params = {
            'hl': 'en',
            'as_sdt': '0,5',
            'q': query,
            'start': i
        }
        response = requests.get(base_url, params=params)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all the search result divs
        result_divs = soup.find_all('div', class_='gs_ri')
        
        for div in result_divs:
            title_tag = div.find('h3', class_='gs_rt')
            snippet_tag = div.find('div', class_='gs_rs')
            
            if title_tag and snippet_tag:
                title = title_tag.text
                snippet = snippet_tag.text
                link = title_tag.find('a')['href'] if title_tag.find('a') else 'No link available'
                articles.append({'title': title, 'snippet': snippet, 'link': link})

    return articles

# Fetch articles related to powerlifting injuries
articles = fetch_articles('powerlifting injuries', num_articles=30)

# Print out the titles, snippets, and links of the articles
for article in articles:
    print(f"Title: {article['title']}\nSnippet: {article['snippet']}\nLink: {article['link']}\n")



Title: Prevalence and consequences of injuries in powerlifting: A cross-sectional study
Snippet: … In contrast to earlier studies on injuries in powerlifting, 1 the definition of an injury used in 
the present study was a condition of pain or impairment of bodily function that affected …
Link: https://journals.sagepub.com/doi/abs/10.1177/2325967118771016

Title: Injuries and overuse syndromes in powerlifting
Snippet: Powerlifting is a discipline of competitive weightlifting. To date, no investigations have focused 
on pain encountered during routine training. The aim of the study was to identify such pain…
Link: https://www.thieme-connect.com/products/ejournals/html/10.1055/s-0031-1277207

Title: Narrative review of injuries in powerlifting with special reference to their association to the squat, bench press and deadlift
Snippet: … the relationships between the powerlifting exercises and the specific injuries or movement 
… Such information could establish injury prevention strategies

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing and stop-word removal
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(tokens)

# Preprocess the snippets of the articles
preprocessed_snippets = [preprocess_text(article['snippet']) for article in articles]


[nltk_data] Downloading package punkt to /Users/maxz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/maxz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(preprocessed_snippets)

# Topic Modeling
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(X)

def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

display_topics(lda, vectorizer.get_feature_names_out(), 10)


Topic 0:
mass age rates injuries body study injury concerning compared focusing
Topic 1:
injury crossfit powerlifting studies published identify injuries included majority weightlifting
Topic 2:
weightlifting powerlifting techniques players coaches district pune olympic disciplines cited
Topic 3:
incidence training hours similar injury weightlifting findings shoulder suggest spine
Topic 4:
cord spinal injury one mansoni accident parasite schistosoma sci due
Topic 5:
number shown investigated population region site back greatest caused low
Topic 6:
injuries review chronic regarding powerlifting weightlifting focus types article use
Topic 7:
injuries muscle aoc co athletes prevalence evaluate current differ especially
Topic 8:
injuries powerlifting injury establish exercises could information relationships strategies prevention
Topic 9:
injury considered acute consist whereas comprehension etiology understanding discipline competition


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Ensure the manual labels are aligned with the number of preprocessed snippets subset
preprocessed_snippets_subset = preprocessed_snippets[:20]  # Assuming 20 samples for simplicity

# Example of manually labeled data with comprehensive labels (ensure it matches the subset length)
manual_labels = [
    'lower back injury', 'shoulder injury', 'knee injury', 'wrist injury', 'elbow injury',
    'hip injury', 'ankle injury', 'neck injury', 'foot injury', 'upper back injury',
    'lower back injury', 'shoulder injury', 'knee injury', 'wrist injury', 'elbow injury',
    'hip injury', 'ankle injury', 'neck injury', 'foot injury', 'upper back injury'
]

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(manual_labels)

# Ensure the length of encoded labels matches the preprocessed snippets
assert len(preprocessed_snippets_subset) == len(encoded_labels)

# Vectorize the preprocessed snippets
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(preprocessed_snippets_subset)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)

# Ensure that target names match the unique classes in the labels
target_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))


ValueError: Number of classes, 7, does not match size of target_names, 10. Try specifying the labels parameter