In [None]:
# Read the uploaded workbook; the result is used below as a mapping from
# sheet name to that sheet's table (it is iterated with .items() and indexed by name).
file_path = 'Relevant List from R1 and R2.xlsx'
dataframes = read_all_sheets_from_excel(file_path)

# Preview every sheet: its name, its first rows, then a divider line
separator = '-' * 80
for sheet_name, df in dataframes.items():
    print('Sheet name:', sheet_name)
    sheet_preview = df.head()
    print(sheet_preview)
    print(separator)

# Pull out the 'Relevant36' sheet, which the rest of the notebook works on
df_relevant36 = dataframes['Relevant36']

# Show its first rows so the header layout can be checked by eye
relevant36_preview = df_relevant36.head()
print(relevant36_preview)

# Perform TF-IDF analysis on the 'Relevant36' DataFrame, assuming the 'Abstract' column contains the text data
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Keep the non-missing abstracts as a Series (not just a list) so that the row
# labels of df_relevant36 survive the dropna() and can be reused below.
abstracts = df_relevant36['Abstract'].dropna().astype(str)
texts = abstracts.tolist()

# Vocabulary is capped at 20 terms, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', max_features=20)

# Learn the vocabulary / IDF weights and score every abstract
tfidf_matrix = vectorizer.fit_transform(texts)

# Vocabulary terms, in the column order of tfidf_matrix
features = vectorizer.get_feature_names_out()

# One row per abstract, one column per term. Reusing abstracts.index keeps each
# row aligned with its source row in df_relevant36; a default 0..n-1 index would
# drift out of step as soon as one abstract is missing.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=features, index=abstracts.index)

# Highest-scoring term per document.
top_scores = tfidf_df.max(axis=1)
# idxmax() returns the first column for an all-zero row, which would report a
# term the document does not contain; leave the top feature empty (NaN) there.
top_features = tfidf_df.idxmax(axis=1).where(top_scores > 0)

# Combine into a summary DataFrame
summary_df = pd.DataFrame({'Top Feature': top_features, 'Score': top_scores})

# Display the first few rows of the summary
print(summary_df.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (30 bins, KDE overlay) of every individual entry of the TF-IDF table
fig, ax = plt.subplots(figsize=(10, 6))
all_scores = tfidf_df.to_numpy().ravel()
sns.histplot(all_scores, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of TF-IDF Scores')
ax.set_xlabel('TF-IDF Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# The per-type analysis needs a column holding each document's type; the
# assumed name is 'Type'. List the available columns first so the right one
# can be identified if that assumption is wrong.
print(df_relevant36.columns)

# Either show how many documents fall under each type, or report that the
# assumed column is missing so the correct one can be specified.
if 'Type' not in df_relevant36.columns:
    print('No column named "Type" found. Please specify the correct column for document type.')
else:
    print(df_relevant36['Type'].value_counts())

In [None]:
# Define categories and associated keywords.
# Order matters: assign_category returns the first category (in this order)
# that has a matching keyword. 'Other' has no keywords and is the fallback.
categories_keywords = {
    'Education': ['education', 'learning', 'teacher', 'student', 'pedagogy', 'assessment', 'evaluation', 'blended learning', 'online education', 'distance learning', 'remote teaching'],
    'Social Media': ['social media', 'WeChat', 'Twitter', 'altmetrics', 'discourse analysis'],
    'Policy': ['policy', 'implementation', 'administrators', 'instructional coherence'],
    'Technology': ['machine learning', 'neural network', 'deep learning', 'big data', 'data mining', 'algorithm', 'support vector machine', 'classification', 'clustering'],
    'Other': []
}

# Function to assign category based on keywords
import numpy as np
def assign_category(text):
    """Return the first category whose keyword list has a hit in ``text``.

    Matching is a case-insensitive substring test, and categories are tried in
    the order they appear in ``categories_keywords``; the first hit wins.

    Args:
        text: The document text (e.g. an abstract). Non-string values such as
            ``None`` or NaN are treated as having no keywords.

    Returns:
        The matching category name, or ``'Other'`` when nothing matches.

    Note:
        Because matching is by substring and first-hit, a text containing
        "machine learning" is labelled 'Education' (via the keyword
        'learning') before 'Technology' is ever considered.
    """
    # Missing values arrive as None/float NaN; they have no .lower() and carry no keywords
    if not isinstance(text, str):
        return 'Other'
    text_lower = text.lower()
    for category, keywords in categories_keywords.items():
        # Lower-case the keyword too: entries such as 'WeChat' and 'Twitter' are
        # stored in mixed case and would otherwise never match the lowered text.
        if any(keyword.lower() in text_lower for keyword in keywords):
            return category
    return 'Other'

# Label every row with a keyword-derived category, stored in a new 'Category'
# column. Abstracts are converted to strings first, so assign_category always
# receives text.
abstract_text = df_relevant36['Abstract'].astype(str)
df_relevant36['Category'] = abstract_text.map(assign_category)

# Number of documents per category
category_counts = df_relevant36['Category'].value_counts()
print(category_counts)

# Now, for each category, plot the TF-IDF score distribution
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay one TF-IDF score histogram per category on a single set of axes
fig, ax = plt.subplots(figsize=(12, 8))
for category in category_counts.index:
    # Abstracts of the documents in this category (missing abstracts dropped)
    in_category = df_relevant36['Category'] == category
    texts_in_category = df_relevant36.loc[in_category, 'Abstract'].dropna().astype(str).tolist()
    if not texts_in_category:
        # A category can consist solely of rows whose abstract is missing;
        # there is nothing to score or draw for it.
        continue
    # Score with the already-fitted vectorizer so every category shares the
    # same vocabulary and IDF weights.
    tfidf_matrix_cat = vectorizer.transform(texts_in_category)
    scores = tfidf_matrix_cat.toarray().flatten()
    # Fix the bin range so all categories share identical bin edges; without it
    # each call derives its own edges and the overlaid bars are not comparable.
    # (0, 1) assumes the vectorizer keeps its default L2 row normalisation, which
    # bounds every score to that interval.
    sns.histplot(scores, bins=30, binrange=(0, 1), kde=True, label=category, alpha=0.6, ax=ax)

ax.set_title('TF-IDF Score Distribution by Document Category')
ax.set_xlabel('TF-IDF Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Hard-coded TF-IDF scores, grouped by the feature they belong to
top_features = {
    'learning': [0.8703, 0.7311],
    'performance': [0.8705, 0.6915, 0.5635, 0.4927],
    'evaluation': [0.8395, 0.7067],
    'media': [0.7559, 0.6942, 0.6259, 0.4686],
    'online': [0.8756, 0.6530, 0.5468],
    'social': [0.7898, 0.7938, 0.6939, 0.6103],
    'quality': [0.7826, 0.5763, 0.5432],
    'teachers': [0.4408],
    'network': [0.7733, 0.5257],
    'teaching': [0.8106, 0.6488, 0.6135],
    'data': [0.7665, 0.6172, 0.5337, 0.6648]
}

# Flatten the mapping into (feature, score) pairs, one per score, keeping the
# dictionary order and the order of scores within each feature
feature_score_pairs = [
    (name, value)
    for name, values in top_features.items()
    for value in values
]
features_list = [name for name, _ in feature_score_pairs]
scores_list = [value for _, value in feature_score_pairs]

# Long-format table: one row per (feature, score) pair, ready for plotting
df_scores = pd.DataFrame({'Feature': features_list, 'Score': scores_list})

# Box plot of the scores per feature, with x tick labels tilted 45 degrees
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_scores, x='Feature', y='Score', ax=ax)
ax.set_title('Distribution of TF-IDF Scores for Top Features')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()