Skip to content

Commit

Permalink
Improved document tracking in deep keyword analysis
Browse files Browse the repository at this point in the history
Keyword analysis report now includes URLs for each keyword term.
  • Loading branch information
nazuke committed Mar 16, 2017
1 parent 70596e2 commit 5f3d327
Show file tree
Hide file tree
Showing 7 changed files with 275 additions and 32 deletions.
109 changes: 104 additions & 5 deletions MacroscopeAnalysis/MacroscopeDeepKeywordAnalysis.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ public class MacroscopeDeepKeywordAnalysis : Macroscope

/**************************************************************************/

Dictionary<string,string> DocList;

// Keyword Term / MacroscopeDocumentList
Dictionary<string,MacroscopeDocumentList> DocList;

/**************************************************************************/

public MacroscopeDeepKeywordAnalysis ()
Expand All @@ -48,7 +49,7 @@ public MacroscopeDeepKeywordAnalysis ()
this.DocList = null;
}

public MacroscopeDeepKeywordAnalysis ( Dictionary<string,string> DocList )
public MacroscopeDeepKeywordAnalysis ( Dictionary<string,MacroscopeDocumentList> DocList )
{
this.SuppressDebugMsg = true;
this.DocList = DocList;
Expand All @@ -62,6 +63,7 @@ public MacroscopeDeepKeywordAnalysis ( Dictionary<string,string> DocList )
int Words
)
{

if( Words == 1 )
{
this.AnalyzeTerm(
Expand All @@ -78,25 +80,93 @@ int Words
Words: Words
);
}

}

/// <summary>
/// Runs keyword analysis over <paramref name="Text"/> and, when this analyzer
/// was constructed with a document list dictionary, records
/// <paramref name="msDoc"/> against every keyword term found in that text.
/// </summary>
/// <param name="msDoc">Document to associate with each term found in <paramref name="Text"/>.</param>
/// <param name="Text">Text to analyze; passed through to AnalyzeTerm / AnalyzePhrase.</param>
/// <param name="Terms">Term counter dictionary passed through to AnalyzeTerm / AnalyzePhrase.</param>
/// <param name="Words">
/// Words per term: 1 dispatches to AnalyzeTerm, greater than 1 dispatches to
/// AnalyzePhrase. For any other value no analysis is performed and nothing is recorded.
/// </param>
public void Analyze (
    MacroscopeDocument msDoc,
    string Text,
    Dictionary<string,int> Terms,
    int Words
)
{

    // Terms found in this one text; stays null when Words < 1.
    Dictionary<string,int> TermsList = null;

    if( Words == 1 )
    {
        TermsList = this.AnalyzeTerm(
            Text: Text,
            Terms: Terms
        );
    }
    else if( Words > 1 )
    {
        TermsList = this.AnalyzePhrase(
            Text: Text,
            Terms: Terms,
            Words: Words
        );
    }

    // Nothing to track if no document list dictionary was supplied (default
    // constructor) or if no analysis ran.
    if( ( this.DocList == null ) || ( TermsList == null ) )
    {
        return;
    }

    // DocList is supplied by the caller of the constructor, so all mutation is
    // done under a lock on the dictionary instance itself.
    lock( this.DocList )
    {

        foreach( string KeywordTerm in TermsList.Keys )
        {

            MacroscopeDocumentList DocumentList;

            // Single lookup instead of ContainsKey followed by the indexer.
            if( !this.DocList.TryGetValue( KeywordTerm, out DocumentList ) )
            {
                DocumentList = new MacroscopeDocumentList ();
                this.DocList.Add( KeywordTerm, DocumentList );
            }

            DocumentList.AddDocument( msDoc );

        }

    }

}

/** Analyze 1 Word ********************************************************/

private void AnalyzeTerm (
private Dictionary<string,int> AnalyzeTerm (
string Text,
Dictionary<string,int> Terms
)
{

Dictionary<string,int> TermsList = new Dictionary<string,int> ();

if( Text.Length > 0 )
{

string [] Chunks = Text.Split( ' ' );

if( Chunks.Length > 0 )
{

for( int i = 0 ; i < Chunks.Length ; i++ )
{

string sTerm = Chunks[ i ];

if( sTerm.Length > 0 )
{

if( Terms.ContainsKey( sTerm ) )
{
Terms[ sTerm ] += 1;
Expand All @@ -105,21 +175,39 @@ int Words
{
Terms.Add( sTerm, 1 );
}

if( TermsList.ContainsKey( sTerm ) )
{
TermsList[ sTerm ] += 1;
}
else
{
TermsList.Add( sTerm, 1 );
}

}

}

}

}

return( TermsList );

}

/** Analyze Multi-Word Phrases ********************************************/

private void AnalyzePhrase (
private Dictionary<string,int> AnalyzePhrase (
string Text,
Dictionary<string,int> Terms,
int Words
)
{

Dictionary<string,int> TermsList = new Dictionary<string,int> ();

if( Text.Length > 0 )
{

Expand Down Expand Up @@ -159,6 +247,15 @@ int Words
Terms.Add( sTerm, 1 );
}

if( TermsList.ContainsKey( sTerm ) )
{
TermsList[ sTerm ] += 1;
}
else
{
TermsList.Add( sTerm, 1 );
}

}

}
Expand All @@ -169,6 +266,8 @@ int Words

}

return( TermsList );

}

/**************************************************************************/
Expand Down
45 changes: 24 additions & 21 deletions MacroscopeDocumentCollection/MacroscopeDocumentCollection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public sealed class MacroscopeDocumentCollection : Macroscope

private Dictionary<string,decimal> StatsDurations;

private Dictionary<string,string> StatsDeepKeywordAnalysisDocs;
private Dictionary<string,MacroscopeDocumentList> StatsDeepKeywordAnalysisDocs;
private List<Dictionary<string,int>> StatsDeepKeywordAnalysis;

private int StatsUrlsInternal;
Expand Down Expand Up @@ -87,8 +87,6 @@ public MacroscopeDocumentCollection ( MacroscopeJobMaster JobMaster )

this.SearchIndex = new MacroscopeSearchIndex ();

this.AnalyzeKeywords = new MacroscopeDeepKeywordAnalysis ();

this.StructInlinks = new Dictionary<string,MacroscopeInlink> ( 1024 );
this.StructHyperlinksIn = new Dictionary<string,MacroscopeHyperlinksIn> ( 1024 );

Expand All @@ -103,13 +101,15 @@ public MacroscopeDocumentCollection ( MacroscopeJobMaster JobMaster )

this.StatsDurations = new Dictionary<string,decimal> ( 1024 );

this.StatsDeepKeywordAnalysisDocs = new Dictionary<string,string> ( 1024 );
this.StatsDeepKeywordAnalysisDocs = new Dictionary<string,MacroscopeDocumentList> ( 1024 );
this.StatsDeepKeywordAnalysis = new List<Dictionary<string,int>> ( 4 );
for( int i = 0 ; i <= 3 ; i++ )
{
this.StatsDeepKeywordAnalysis.Add( new Dictionary<string,int> ( 1024 ) );
}

this.AnalyzeKeywords = new MacroscopeDeepKeywordAnalysis ( DocList: this.StatsDeepKeywordAnalysisDocs );

this.StatsUrlsInternal = 0;
this.StatsUrlsExternal = 0;
this.StatsUrlsSitemaps = 0;
Expand Down Expand Up @@ -1205,6 +1205,10 @@ private void ClearStatsDeepKeywordAnalysis ()
this.StatsDeepKeywordAnalysis[ i ].Clear();
}
}
lock( this.StatsDeepKeywordAnalysisDocs )
{
this.StatsDeepKeywordAnalysisDocs.Clear();
}
}
}

Expand All @@ -1230,22 +1234,18 @@ private void RecalculateStatsDeepKeywordAnalysis ( MacroscopeDocument msDoc )

if( sLang != null )
{
DebugMsg( string.Format( "RecalculateStatsDeepKeywordAnalysis: GetLang {0}", msDoc.GetLang() ) );
if( Regex.IsMatch( msDoc.GetLang(), "^(x-default|en|fr|de|it|es|po)", RegexOptions.IgnoreCase ) )
{
lock( this.StatsDeepKeywordAnalysis )
{
for( int i = 0 ; i <= 3 ; i++ )
{
this.AnalyzeKeywords.Analyze(
msDoc: msDoc,
Text: msDoc.GetBodyText(),
Terms: this.StatsDeepKeywordAnalysis[ i ],
Words: i + 1
);


//Dictionary<string,string> StatsDeepKeywordAnalysisDocs;

}
}
}
Expand All @@ -1259,19 +1259,8 @@ private void RecalculateStatsDeepKeywordAnalysis ( MacroscopeDocument msDoc )
{

int iWordsOffset = Words - 1;

DebugMsg( string.Format( "GetDeepKeywordAnalysisAsDictonary: Words: {0}", Words ) );
DebugMsg( string.Format( "GetDeepKeywordAnalysisAsDictonary: iWordsOffset: {0}", iWordsOffset ) );

DebugMsg( string.Format( "GetDeepKeywordAnalysisAsDictonary: this.StatsDeepKeywordAnalysis: {0}", this.StatsDeepKeywordAnalysis[ iWordsOffset ].Count ) );


Dictionary<string,int> Terms = new Dictionary<string,int> ( this.StatsDeepKeywordAnalysis[ iWordsOffset ].Count );


DebugMsg( string.Format( "GetDeepKeywordAnalysisAsDictonary: Terms: {0}", Terms.Count ) );



lock( this.StatsDeepKeywordAnalysis[iWordsOffset] )
{
foreach( string sTerm in this.StatsDeepKeywordAnalysis[iWordsOffset].Keys )
Expand All @@ -1284,6 +1273,20 @@ private void RecalculateStatsDeepKeywordAnalysis ( MacroscopeDocument msDoc )

}

/// <summary>
/// Looks up the document list recorded for a keyword term.
/// </summary>
/// <param name="KeywordTerm">Keyword term used as the dictionary key.</param>
/// <returns>
/// The <c>MacroscopeDocumentList</c> stored for <paramref name="KeywordTerm"/>,
/// or <c>null</c> when the term has no entry.
/// </returns>
public MacroscopeDocumentList GetDeepKeywordAnalysDocumentList ( string KeywordTerm )
{

    MacroscopeDocumentList DocumentList = null;

    // The same dictionary instance is filled and cleared under a lock on
    // itself elsewhere, so take that lock here too: Dictionary does not
    // support a read that races with a write.
    // NOTE(review): the returned list is handed out after the lock is
    // released; confirm MacroscopeDocumentList tolerates concurrent use.
    lock( this.StatsDeepKeywordAnalysisDocs )
    {
        // On a miss TryGetValue sets the out value to null, matching the
        // original "not found" result.
        this.StatsDeepKeywordAnalysisDocs.TryGetValue( KeywordTerm, out DocumentList );
    }

    return( DocumentList );

}

/** Search Index **********************************************************/

public MacroscopeSearchIndex GetSearchIndex ()
Expand Down

1 comment on commit 5f3d327

@nazuke
Copy link
Owner Author

@nazuke nazuke commented on 5f3d327 Mar 16, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seemed to increase RAM usage considerably.

Please sign in to comment.